{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kaggle-房价预测"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step1 检视数据源"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_tra = pd.read_csv(\"input/train.csv\", index_col=0)\n",
    "data_tes = pd.read_csv(\"input/test.csv\", index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "Id                                                                    \n",
       "1           60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "2           20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "3           60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "4           70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "5           60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "   LandContour Utilities LotConfig    ...     PoolArea PoolQC Fence  \\\n",
       "Id                                    ...                             \n",
       "1          Lvl    AllPub    Inside    ...            0    NaN   NaN   \n",
       "2          Lvl    AllPub       FR2    ...            0    NaN   NaN   \n",
       "3          Lvl    AllPub    Inside    ...            0    NaN   NaN   \n",
       "4          Lvl    AllPub    Corner    ...            0    NaN   NaN   \n",
       "5          Lvl    AllPub       FR2    ...            0    NaN   NaN   \n",
       "\n",
       "   MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "Id                                                                         \n",
       "1          NaN       0      2    2008        WD         Normal     208500  \n",
       "2          NaN       0      5    2007        WD         Normal     181500  \n",
       "3          NaN       0      9    2008        WD         Normal     223500  \n",
       "4          NaN       0      2    2006        WD        Abnorml     140000  \n",
       "5          NaN       0     12    2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_tra.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step2 合并数据（为了处理数据时更加方便，还要把训练集中的label值分离出来）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001FEF129F550>,\n",
       "        <matplotlib.axes._subplots.AxesSubplot object at 0x000001FEF131D550>]],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHLRJREFUeJzt3X+QHOV95/H3xxK/wi/xY9kTks6CQ7cHZxUgK1g+rpwN2AQEQbgOcuJURoA4uWKwodAdFiaVs3N2CpLC/Ehc2DqwETlsUMAYhR82RDB3hyvIRoAlQFZYZIE2khEYEAgC8cL3/uhnYDSa3emZndmZbT6vqqntfvrp7u/09n73mad7nlZEYGZmxfWRTgdgZmbt5URvZlZwTvRmZgXnRG9mVnBO9GZmBedEb2ZWcE70XUzS/ZIWdjoO+3CQtEnSp1uwnZMk/WgU6y+Q9MBo42gFSV+SdGWn4xgt+T76sSVpE9ALvAu8CdwHfDEidnQyLrN0bl4QEX8/yu08BlwUEY+2JLA2k7QM+D1gBnB+RNxcsWxPYACYFRHbOhPh6LlF3xl/GBH7ALOA3wX+pHKhMv7d2Lgj6XeB/ZtN8pImtjik8nZLkvqHWfwL4AvA49ULIuJt4H7gnHbENVacTDooIv6J7CT6WDoRvyHpp8BbwOGp7IJyfUn/VdJ6SW9IekbSrFR+qKQ7Jb0k6VeSvtSZd2RFIGkPSddK2pJe10rao2L5ZZK2pmUXSApJR6TFpwD/p2p7kbpANkp6WdJflhsyks6V9FNJ10h6BfhqKnukYv1/L+lBSa9IelHSV1L5RyQtlfScpN9IWiHpwEbfb0R8KyJWAW8PU6UEnNrodruJE30HSZoGzAWeSEWfAxYD+wLPV9U9C/gqWctiP+B04DfpD+bvyFolU4ATgUsk/cEYvAUrpiuAOcAxwNHAcaRPnZJOBi4FPg0cQdblUWkmsKHGNj8LzCb7FDsPOL9i2SeAjcAhwDcqV5K0L/D3wI+BQ9M+V6XFXwLOSDEcCrwKfKvB95rHerLjMG450XfGjyS9BjxC1vr581R+c0Q8HRFDEfHbqnUuAP4iIn4emYGIeJ6s66cnIv4sIv4lIjYC/wuYP1ZvxgpnAfBnEbEtIl4CvkbWCAH4I+B76Tx9Ky2rNAl4o8Y2r4qIVyLiBeBa4OyKZVsi4q/Sef/PVeudBvw6Iq6OiLcj4o2IWJ2WfR64IiIGI+IdsobQmW3o/nkD2L/F2xxTbekPs7rOqL7gJQlg8wjrTAOeq1H+UeDQ9I+jbALw/0YbpH1oHcrOnyifT2XlZY9VLKs+Z18l+0RarbJe5fZqbaPScOc9ZOf+XZLeqyh7l+xmh3+q+pvYB7hH0lCavzIi8t5Nsy+wPWfdruRE311GugVqM/Bvhin/VUTMaE9I9iG0hSyJPp3m/3UqA9gKTK2oO61q3bXAv62xzWnDbA/qn/dnj7Ds/Ij4aa2FETGpPC2pBHw1Ikoj7Gs4R5J1jY5b7roZP24E/pukj6e7co6Q9FHgZ8Drkr4saS9JEyR9LN39YNaMHwB/IqlH0sHAnwL/Oy1bAZwn6UhJv5OWVbqPXfvtAf67pAPSdamLgdtzxnIP8K8kXZIuEu8r6RNp2beBb6S/A1K883K/y0TS7uk2SgG7Sdqz6q633yO7aWLccqIfJyLib8kuVH2frM/wR8CBEfEu8IdkF85+BbxM9k9hXPcpWkd9nax7Zi2wjuy2w68DRMT9wPXAw2T3l/9DWuedtPxxYHtFMi67G1gDPAncC9yUJ5CIeAP4DNk5/mvgWeD30+LrgJXAA5LeAB4lu7DbqAeAfwb+A7AsTX8K3r+Pfi6wvIntdg1/YcrMmibpSOApYI+IGEplJwFfiIgz0nwAMyJioHORNkfSF4FpEXFZp2MZDSd6M2uIpM+Stcr3JmvpvldO6sPUH7eJvijcdWNmjfo88BLZ3TDv
An/c2XCsHrfozcwKzi16M7OC64r76CdNmhRHHHFE/Ypj6M0332TvvffudBg7cUwjW7NmzcsR0dPpOPI4+OCDY/r06WO+3276fdXjWOvLe853RaLv7e3lscceq19xDJVKJfr7+zsdxk4c08gkPV+/VneYPn16R875bvp91eNY68t7zrvrxsys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4JzozcwKzonezKzguuKbsa02fem9Da+z6cpT2xCJmUFzf5M3nzw+hj8YD9yiNzMrOCd6M7OCq5voJfVJerLi9Xp6UO+Bkh6U9Gz6eUCqL0nXSxqQtFbSrPa/DTMzG07dRB8RGyLimIg4Bvg48BZwF7AUWBURM4BVaR7gFGBGei0GbmhH4GZmlk+jXTcnAs9FxPPAPD54MvpyoPzMyHnALZF5FJgkaXJLojUzs4Y1etfNfOAHabo3IrYCRMRWSYek8inA5op1BlPZ1soNSVpM1uKnp6eHUqnUYCjDWzJzqOF1qve/Y8eOlsbUCo7JzJqRO9FL2h04Hbi8XtUaZbs8mDYilgHLAPr6+qKVg/af28ztlQt23n83PvTAMZlZMxrpujkFeDwiXkzzL5a7ZNLPbal8EJhWsd5UYMtoAzUzs+Y0kujP5oNuG4CVwMI0vRC4u6L8nHT3zRxge7mLx8zMxl6urhtJvwN8Bvh8RfGVwApJi4AXgLNS+X3AXGCA7A6d81oWrZmZNSxXoo+It4CDqsp+Q3YXTnXdAC5sSXRmZjZq/masmVnBOdGbmRWcE72ZWcE50ZuZFZwTvZlZwTnRm5kVnBO9mVnBOdGbmRWcE72ZWcE50ZuZFZwTvZlZwTnRm5kVnBO9mVnBOdGbmRWcE72ZWcE50ZsNQ9IESU9IuifNHyZptaRnJd2enqOMpD3S/EBaPr2TcZtVc6I3G97FwPqK+auAayJiBvAqsCiVLwJejYgjgGtSPbOu4URvVoOkqcCpwI1pXsAJwB2pynLgjDQ9L82Tlp+Y6pt1BSd6s9quBS4D3kvzBwGvRcRQmh8EpqTpKcBmgLR8O1WP3jTrpFzPjDX7MJF0GrAtItZI6i8X16gaOZZVbncxsBigt7eXUqk0+mAbtGPHjo7sd8nMofqVqmx7ZTt/devdDa0zc8r+De+nFTp1XPPKleglTSL7CPsxshP4fGADcDswHdgE/FFEvJo+sl4HzAXeAs6NiMdbHrlZ+xwPnC5pLrAnsB9ZC3+SpImp1T4V2JLqDwLTgEFJE4H9gVeqNxoRy4BlALNnz47+/v52v49dlEolOrHfc5fe2/A6S2YOcfW6xtqimxb0N7yfVujUcc0rb9fNdcCPI+LfAUeTXaBaCqxKF6ZWpXmAU4AZ6bUYuKGlEZu1WURcHhFTI2I6MB94KCIWAA8DZ6ZqC4Fyc3NlmictfygidmnRm3VK3UQvaT/gU8BNABHxLxHxGjtfgKq+MHVLZB4lawVNbnnkZmPvy8ClkgbI+uBvSuU3AQel8kv5oNFj1hXyfC46HHgJ+J6ko4E1ZLed9UbEVoCI2CrpkFT//QtTSfmi1dbKjVb2V/b09LS0f6uZ/sDq/Xdjn5tjGnsRUQJKaXojcFyNOm8DZ41pYGYNyJPoJwKzgC9GxGpJ1zFyiyXXhanK/sq+vr6W9lc20x9Y3bfXjX1ujsnMmpGnj34QGIyI1Wn+DrLE/2K5Syb93FZRf1rF+pUXrczMbIzVTfQR8Wtgs6S+VHQi8Aw7X4CqvjB1jjJzgO3lLh4zMxt7ee9d+iJwaxrbYyNwHtk/iRWSFgEv8EEf5X1kt1YOkN1eeV5LIzYzs4bkSvQR8SQwu8aiE2vUDeDCUcZlZmYt4iEQzMwKzonezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4Jzo
zcwKLleil7RJ0jpJT0p6LJUdKOlBSc+mnwekckm6XtKApLWSZrXzDZiZ2cgaadH/fkQcExHlh4QvBVZFxAxgVZoHOAWYkV6LgRtaFayZmTVuNF0384DlaXo5cEZF+S2ReRSYJGnyKPZjZmajkDfRB/CApDWSFqey3ojYCpB+HpLKpwCbK9YdTGVmZtYBE3PWOz4itkg6BHhQ0i9HqKsaZbFLpewfxmKAnp4eSqVSzlDqWzJzqOF1qve/Y8eOlsbUCo7JzJqRK9FHxJb0c5uku4DjgBclTY6IralrZluqPghMq1h9KrClxjaXAcsA+vr6or+/v+k3Ue3cpfc2vM6mBTvvv1Qq0cqYWsExmVkz6nbdSNpb0r7laeAk4ClgJbAwVVsI3J2mVwLnpLtv5gDby108ZmY29vK06HuBuySV638/In4s6efACkmLgBeAs1L9+4C5wADwFnBey6M2M7Pc6ib6iNgIHF2j/DfAiTXKA7iwJdGZmdmo+ZuxZmYF50RvZlZwTvRmVSTtKelnkn4h6WlJX0vlh0lanYb9uF3S7ql8jzQ/kJZP72T8ZtWc6M129Q5wQkQcDRwDnJzuILsKuCYN+/EqsCjVXwS8GhFHANekemZdw4nerEoavmNHmt0tvQI4AbgjlVcP+1EeDuQO4ESl29TMuoETvVkNkiZIepLsi4APAs8Br0VE+WvXlUN7vD/sR1q+HThobCM2G17eIRDMPlQi4l3gGEmTgLuAI2tVSz8bHvajt7e3I0NHdGrIimaGJendq/H1OjUcR7cPBeJEbzaCiHhNUgmYQzYS68TUaq8c2qM87MegpInA/sArNbb1/rAfs2fPbumwH3l1asiKZoYlWTJziKvXNZaiqocyGSvdPhSIu27MqkjqSS15JO0FfBpYDzwMnJmqVQ/7UR4O5EzgofTFQbOu4Ba92a4mA8slTSBrDK2IiHskPQPcJunrwBPATan+TcDfSBoga8nP70TQZsNxojerEhFrgWNrlG8kG7m1uvxtPhjryazruOvGzKzgnOjNzArOid7MrOCc6M3MCs6J3sys4JzozcwKzonezKzgnOjNzArOid7MrOByJ/o0bOsTku5J837ajpnZONBIi/5isoGdyvy0HTOzcSBXopc0FTgVuDHNCz9tx8xsXMg7qNm1wGXAvmn+IHI+bUdS+Wk7L1dusPIhDD09PS0dtL+ZhxxU778bHyTgmMysGXUTvaTTgG0RsUZSf7m4RtWGnrZT+RCGvr6+lj6EoZmHHFQ/sKAbHyTgmMysGXla9McDp0uaC+wJ7EfWwh/V03bMzGxs1O2jj4jLI2JqREwne6DCQxGxAD9tx8xsXBjNffRfBi5NT9U5iJ2ftnNQKr8UWDq6EM3MbDQaesJURJSAUpr203bMzMYBfzPWzKzgnOjNzArODwdPplfdkrlk5lDd2zQ3XXlqO0MyM2sJt+jNzArOid7MrOCc6M3MCs6J3sys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M3MCs7fjDWzwqj+hnteRf+Wu1v0ZmYF50RvZlZwTvRmZgXnRG9mVnBO9GZmBedEb1ZF0jRJD0taL+lpSRen8gMlPSjp2fTzgFQuSddLGpC0VtKszr4Ds5050ZvtaghYEhFHAnOACyUdRfag+1URMQNYxQcPvj8FmJFei4Ebxj5ks+HVTfSS9pT0M0m/SK2br6XywyStTq2b2yXtnsr3SPMDafn09r4Fs9aKiK0R8XiafgNYD0wB5gHLU7XlwBlpeh5wS2QeBSZJmjzGYZsNK88Xpt4BToiIHZJ2Ax6RdD9wKXBNRNwm6dvAIrKWzCLg1Yg4QtJ84CrgP7cpfrO2Sg2VY4HVQG9EbIXsn4GkQ1K1KcDmitUGU9nWqm0tJmvx09vbS6lUamfoNe3YsaMj+10yc6jhdXr3am69Zoz2mHTquOZVN9FHRAA70uxu6RXACcB/SeXLga+SJfp5aRrgDuCvJSltx2zckLQPcCdw
SUS8LmnYqjXKdjnfI2IZsAxg9uzZ0d/f36JI8yuVSnRiv/Wev1zLkplDXL1ubL68v2lB/6jW79RxzSvXUZQ0AVgDHAF8C3gOeC0iyv9uyy0YqGjdRMSQpO3AQcDLVdt8v3XT09PT0v+GrWgF5GlNjPV/8G5sNXRjTK2QPr3eCdwaET9MxS9Kmpxa85OBbal8EJhWsfpUYMvYRWs2slyJPiLeBY6RNAm4CziyVrX0s+HWTV9fX0tbN820HqrlaU2MthXQqG5sNXRjTKOlrOl+E7A+Ir5ZsWglsBC4Mv28u6L8Ikm3AZ8Atpe7eMy6QUOfiyLiNUklsjsRJkmamFr1lS2YcutmUNJEYH/gldaFbNZ2xwOfA9ZJejKVfYUswa+QtAh4ATgrLbsPmAsMAG8B541tuGYjq5voJfUAv01Jfi/g02QXWB8GzgRuY9fWzULgH9Lyh9w/b+NJRDxC7U+mACfWqB/AhW0NymwU8rToJwPLUz/9R4AVEXGPpGeA2yR9HXiC7KMu6effSBoga8nPb0PcZmaWU567btaS3V5WXb4ROK5G+dt88JHWzAqm2THfrXP8zVgzs4JzojczKzgnejOzgnOiNzMrOCd6M7OCc6I3Mys4J3ozs4JzojczKzgnejOzgnOiNzMrOCd6M7OCc6I3Mys4J3ozs4JzojczKzgnejOzgnOiNzMrOCd6M7OCc6I3Mys4J3ozs4Krm+glTZP0sKT1kp6WdHEqP1DSg5KeTT8PSOWSdL2kAUlrJc1q95swM7Ph5WnRDwFLIuJIYA5woaSjgKXAqoiYAaxK8wCnADPSazFwQ8ujNjOz3Oom+ojYGhGPp+k3gPXAFGAesDxVWw6ckabnAbdE5lFgkqTJLY/czMxyaaiPXtJ04FhgNdAbEVsh+2cAHJKqTQE2V6w2mMrMzKwDJuatKGkf4E7gkoh4XdKwVWuURY3tLSbr2qGnp4dSqZQ3lLqWzBwa9TZ696q/nVbGnMeOHTvGfJ/1dGNMZrazXIle0m5kSf7WiPhhKn5R0uSI2Jq6Zral8kFgWsXqU4Et1duMiGXAMoC+vr7o7+9v7h3UcO7Se0e9jSUzh7h63ciHZ9OC/lHvpxGlUolWHqdW6MaYzGxnee66EXATsD4ivlmxaCWwME0vBO6uKD8n3X0zB9he7uIxM7Oxl6dFfzzwOWCdpCdT2VeAK4EVkhYBLwBnpWX3AXOBAeAt4LyWRmxmZg2pm+gj4hFq97sDnFijfgAXjjIuMzNrEX8z1sys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M1qkPRdSdskPVVR5hFbbVxyojer7Wbg5Koyj9hq45ITvVkNEfF/gVeqij1iq41LuQc1M7OdR2yVVG/E1p2G/qgcyK+3t7cjg8G1YhC6VgwamEeegQVbZbTHpNsH9+v6RD+9BQOUmbVZrhFbKwfymz17dksH8surFYPQtWLQwDzyDCzYKqMdoLDbB/dz141Zfi+Wu2SaGbHVrFOc6M3y84itNi51fdeNWSdI+gHQDxwsaRD4H3jE1sJqpot405WntiGS9nCiH4Vmrx+MpxPkwyoizh5mkUdstXHHXTdmZgXnRG9mVnBO9GZmBedEb2ZWcE70ZmYF50RvZlZwdRO9h2s1Mxvf8rTob8bDtZqZjVt1E72HazUzG9+a/WbsqIZrhZ2HbO3p6Rl2iM+xGqa0WjuHSG12ONNuHAq1G2Mys521egiEXMO1ws5Dtvb19Q07ZOtYDYlarZ1DpDY7JGo3DoXajTGZ2c6avevGw7WamY0TzSZ6D9dqZjZO1O2b8HCtZmbjW91E7+FazYrJj+n88PA3Y83MCs6J3sys4JzozcwKzonezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4NozapeNqJkvqmy68tQ2RGJmHwZu0ZuZFZwTvZlZwTnRm5kVnBO9mVnBOdGbmRWc77oxM2tC5d1zS2YO5XrsaafunnOL3sys4JzozcwKzl03
48T0pffm/nhY5i9ZmRk40ZsVQt5vWzfaWLBicNeNmVnBtaVFL+lk4DpgAnBjRFzZjv2YdROf91ZPp8a5anmilzQB+BbwGWAQ+LmklRHxTKv3ZdYtWnneN5MMzEbSjhb9ccBARGwEkHQbMA9woh9jY5EwxrrPt4svMPu8t66liGjtBqUzgZMj4oI0/zngExFxUVW9xcDiNPsx4KmWBjJ6BwMvdzqIKo5pZB+NiJ5O7DjPeV91zvcBG8Y80O76fdXjWOvLdc63o0WvGmW7/DeJiGXAMgBJj0XE7DbE0jTHlE83xtQhdc/7ynO+U8bT78uxtk477roZBKZVzE8FtrRhP2bdxOe9da12JPqfAzMkHSZpd2A+sLIN+zHrJj7vrWu1vOsmIoYkXQT8hOw2s+9GxNN1Vuvox9lhOKZ8ujGmMdfked8J4+n35VhbpOUXY83MrLv4m7FmZgXnRG9mVnAdT/SSTpa0QdKApKUt2N40SQ9LWi/paUkXp/IDJT0o6dn084BULknXp/2vlTSrYlsLU/1nJS2sKP+4pHVpneslaaR9pGUTJD0h6Z40f5ik1anu7ekCHpL2SPMDafn0im1cnso3SPqDesdwuH2kZZMk3SHpl+lYfbLTx8haS9J3JW2T9FRF2Vnp7+I9SV1zO+Awsf5lOj/XSrpL0qROxlg2TKz/M8X5pKQHJB3ayRh3EREde5FdtHoOOBzYHfgFcNQotzkZmJWm9wX+ETgK+AtgaSpfClyVpucC95PdBz0HWJ3KDwQ2pp8HpOkD0rKfAZ9M69wPnJLKa+4jzV8KfB+4J82vAOan6W8Df5ymvwB8O03PB25P00el47MHcFg6bhNGOobD7SPNLwcuSNO7A5M6fYz8avnf16eAWcBTFWVHkn1ZqwTM7nSMdWI9CZiYpq/qlnNlmFj3q5j+UvlvuFtenT5gnwR+UjF/OXB5i/dxN9n4IxuAyalsMrAhTX8HOLui/oa0/GzgOxXl30llk4FfVpS/X2+EfUwFVgEnAPek5PdyxUn8/nEgu2vjk2l6Yqqn6mNTrjfcMayzj/2AX5Euxle/904cI7/a9jc2vTIhVZSX6KJEP1KsadlngVs7HWPOWC8Hbuh0jJWvTnfdTAE2V8wPprKWSN0exwKrgd6I2AqQfh5SJ4aRygeHiXm4fVwLXAa8l+YPAl6LiKEa23h/v2n59lS/0ThH2sfhwEvA91J30o2S9u7wMTIbyflknwy7lqRvSNoMLAD+tNPxVOp0os81XEJTG5b2Ae4ELomI15uIodHy4eI4DdgWEWty7LOV8Yy0j4lkHz1viIhjgTfJulGG09ZjZDYSSVcAQ8CtnY5lJBFxRURMI4vzonr1x1KnE31bvjYuaTeyJH9rRPwwFb8oaXJaPhnYVieGkcqnDhNzrX0cD5wuaRNwG1n3zbXAJEkTa2zj/f2m5fsDrzQR58t19jEYEavT/B1kib9Tx8ispnSB/zRgQaR+kXHg+8B/6nQQlTqd6Fv+tfF0d8dNwPqI+GbFopVA+a6QhWR99+Xyc9KdJXOA7alL4SfASZIOSHeGnETWx70VeEPSnLSvc6q2tdM+IuLyiJgaEdPT+3soIhYADwNnDhNPeRtnpvqRyuenu3IOA2aQXfCseQzTOjX3ERG/BjZL6kvLTiQbTrcjxwizGpQ9yOXLwOkR8Van4xmJpBkVs6cDv+xULDV1+iIB2R0d/0h258gVLdjefyTrJlgLPJlec8n6rFcBz6afB6b6IntgxHPAOiouUJH1Cw6k13kV5bPJhlV+DvhrPviGcc19VKzXzwd33RxOlqgHgL8F9kjle6b5gbT88Ir1r0j73EC6i2WkYzjcPtKyY4DH0nH6EdldMx0/Rn619G/rB8BW4Ldkn7IWkV3UHATeAV6k4kJ+F8Y6QHYNqPx33BV3sgwT653pfF8L/B0wpdNxVr48BIKZWcF1uuvGzMzazInezKzgnOjNzArOid7MrOCc6M3MCs6J3sys4Jzo
zcwK7v8DIMTdCHcRJN8AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1fef1260eb8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#首先，看一下标签值是否平滑，是否满足一个标准分布\n",
    "%matplotlib inline\n",
    "prices = pd.DataFrame({\"Price\":data_tra[\"SalePrice\"], \"log(price+1)\":np.log1p(data_tra[\"SalePrice\"])})\n",
    "prices.hist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = np.log1p(data_tra.pop(\"SalePrice\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "#合并数据集\n",
    "data_all = pd.concat((data_tra, data_tes), axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step3 变量转化"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### category数据转换成numerical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass_120</th>\n",
       "      <th>MSSubClass_150</th>\n",
       "      <th>MSSubClass_160</th>\n",
       "      <th>MSSubClass_180</th>\n",
       "      <th>MSSubClass_190</th>\n",
       "      <th>MSSubClass_20</th>\n",
       "      <th>MSSubClass_30</th>\n",
       "      <th>MSSubClass_40</th>\n",
       "      <th>MSSubClass_45</th>\n",
       "      <th>MSSubClass_50</th>\n",
       "      <th>MSSubClass_60</th>\n",
       "      <th>MSSubClass_70</th>\n",
       "      <th>MSSubClass_75</th>\n",
       "      <th>MSSubClass_80</th>\n",
       "      <th>MSSubClass_85</th>\n",
       "      <th>MSSubClass_90</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    MSSubClass_120  MSSubClass_150  MSSubClass_160  MSSubClass_180  \\\n",
       "Id                                                                   \n",
       "1                0               0               0               0   \n",
       "2                0               0               0               0   \n",
       "3                0               0               0               0   \n",
       "4                0               0               0               0   \n",
       "5                0               0               0               0   \n",
       "\n",
       "    MSSubClass_190  MSSubClass_20  MSSubClass_30  MSSubClass_40  \\\n",
       "Id                                                                \n",
       "1                0              0              0              0   \n",
       "2                0              1              0              0   \n",
       "3                0              0              0              0   \n",
       "4                0              0              0              0   \n",
       "5                0              0              0              0   \n",
       "\n",
       "    MSSubClass_45  MSSubClass_50  MSSubClass_60  MSSubClass_70  MSSubClass_75  \\\n",
       "Id                                                                              \n",
       "1               0              0              1              0              0   \n",
       "2               0              0              0              0              0   \n",
       "3               0              0              1              0              0   \n",
       "4               0              0              0              1              0   \n",
       "5               0              0              1              0              0   \n",
       "\n",
       "    MSSubClass_80  MSSubClass_85  MSSubClass_90  \n",
       "Id                                               \n",
       "1               0              0              0  \n",
       "2               0              0              0  \n",
       "3               0              0              0  \n",
       "4               0              0              0  \n",
       "5               0              0              0  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#MSSubClass是一个category，数字意义不大，需要转换成string\n",
    "data_all[\"MSSubClass\"] = data_all[\"MSSubClass\"].astype(str)\n",
    "data_all[\"MSSubClass\"].value_counts()\n",
    "#对字符串进行One-hot编码\n",
    "pd.get_dummies(data_all[\"MSSubClass\"], prefix=\"MSSubClass\").head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#因为category都是字符串类型的，所以要将字符串类型的category进行One-hot编码\n",
    "data_all_dummy = pd.get_dummies(data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>2003</td>\n",
       "      <td>2003</td>\n",
       "      <td>196.0</td>\n",
       "      <td>706.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "      <td>1976</td>\n",
       "      <td>1976</td>\n",
       "      <td>0.0</td>\n",
       "      <td>978.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>284.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>2001</td>\n",
       "      <td>2002</td>\n",
       "      <td>162.0</td>\n",
       "      <td>486.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>434.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>1915</td>\n",
       "      <td>1970</td>\n",
       "      <td>0.0</td>\n",
       "      <td>216.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>540.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>2000</td>\n",
       "      <td>2000</td>\n",
       "      <td>350.0</td>\n",
       "      <td>655.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>490.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \\\n",
       "Id                                                                            \n",
       "1          65.0     8450            7            5       2003          2003   \n",
       "2          80.0     9600            6            8       1976          1976   \n",
       "3          68.0    11250            7            5       2001          2002   \n",
       "4          60.0     9550            7            5       1915          1970   \n",
       "5          84.0    14260            8            5       2000          2000   \n",
       "\n",
       "    MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF          ...            \\\n",
       "Id                                                         ...             \n",
       "1        196.0       706.0         0.0      150.0          ...             \n",
       "2          0.0       978.0         0.0      284.0          ...             \n",
       "3        162.0       486.0         0.0      434.0          ...             \n",
       "4          0.0       216.0         0.0      540.0          ...             \n",
       "5        350.0       655.0         0.0      490.0          ...             \n",
       "\n",
       "    SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \\\n",
       "Id                                                            \n",
       "1                0             0             0            1   \n",
       "2                0             0             0            1   \n",
       "3                0             0             0            1   \n",
       "4                0             0             0            1   \n",
       "5                0             0             0            1   \n",
       "\n",
       "    SaleCondition_Abnorml  SaleCondition_AdjLand  SaleCondition_Alloca  \\\n",
       "Id                                                                       \n",
       "1                       0                      0                     0   \n",
       "2                       0                      0                     0   \n",
       "3                       0                      0                     0   \n",
       "4                       1                      0                     0   \n",
       "5                       0                      0                     0   \n",
       "\n",
       "    SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  \n",
       "Id                                                                     \n",
       "1                      0                     1                      0  \n",
       "2                      0                     1                      0  \n",
       "3                      0                     1                      0  \n",
       "4                      0                     0                      0  \n",
       "5                      0                     1                      0  \n",
       "\n",
       "[5 rows x 303 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### numerical数据的处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LotFrontage          486\n",
       "GarageYrBlt          159\n",
       "MasVnrArea            23\n",
       "BsmtHalfBath           2\n",
       "BsmtFullBath           2\n",
       "BsmtFinSF2             1\n",
       "GarageCars             1\n",
       "TotalBsmtSF            1\n",
       "BsmtUnfSF              1\n",
       "GarageArea             1\n",
       "BsmtFinSF1             1\n",
       "Condition1_Artery      0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.isnull().sum().sort_values(ascending=False).head(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LotFrontage        69.305795\n",
       "LotArea         10168.114080\n",
       "OverallQual         6.089072\n",
       "OverallCond         5.564577\n",
       "YearBuilt        1971.312778\n",
       "YearRemodAdd     1984.264474\n",
       "MasVnrArea        102.201312\n",
       "BsmtFinSF1        441.423235\n",
       "BsmtFinSF2         49.582248\n",
       "BsmtUnfSF         560.772104\n",
       "TotalBsmtSF      1051.777587\n",
       "1stFlrSF         1159.581706\n",
       "dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_cols = data_all_dummy.mean()\n",
    "mean_cols.head(12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_all_dummy = data_all_dummy.fillna(mean_cols)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "      <td>2919.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>69.305795</td>\n",
       "      <td>10168.114080</td>\n",
       "      <td>6.089072</td>\n",
       "      <td>5.564577</td>\n",
       "      <td>1971.312778</td>\n",
       "      <td>1984.264474</td>\n",
       "      <td>102.201312</td>\n",
       "      <td>441.423235</td>\n",
       "      <td>49.582248</td>\n",
       "      <td>560.772104</td>\n",
       "      <td>...</td>\n",
       "      <td>0.002741</td>\n",
       "      <td>0.081877</td>\n",
       "      <td>0.002398</td>\n",
       "      <td>0.865022</td>\n",
       "      <td>0.065091</td>\n",
       "      <td>0.004111</td>\n",
       "      <td>0.008222</td>\n",
       "      <td>0.015759</td>\n",
       "      <td>0.822885</td>\n",
       "      <td>0.083933</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>21.312345</td>\n",
       "      <td>7886.996359</td>\n",
       "      <td>1.409947</td>\n",
       "      <td>1.113131</td>\n",
       "      <td>30.291442</td>\n",
       "      <td>20.894344</td>\n",
       "      <td>178.626089</td>\n",
       "      <td>455.532750</td>\n",
       "      <td>169.176615</td>\n",
       "      <td>439.468337</td>\n",
       "      <td>...</td>\n",
       "      <td>0.052289</td>\n",
       "      <td>0.274225</td>\n",
       "      <td>0.048920</td>\n",
       "      <td>0.341758</td>\n",
       "      <td>0.246728</td>\n",
       "      <td>0.063996</td>\n",
       "      <td>0.090317</td>\n",
       "      <td>0.124562</td>\n",
       "      <td>0.381832</td>\n",
       "      <td>0.277335</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>21.000000</td>\n",
       "      <td>1300.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1872.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>60.000000</td>\n",
       "      <td>7478.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1953.500000</td>\n",
       "      <td>1965.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>220.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>69.305795</td>\n",
       "      <td>9453.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1993.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>369.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>467.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>78.000000</td>\n",
       "      <td>11570.000000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2001.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>163.500000</td>\n",
       "      <td>733.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>805.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>313.000000</td>\n",
       "      <td>215245.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1600.000000</td>\n",
       "      <td>5644.000000</td>\n",
       "      <td>1526.000000</td>\n",
       "      <td>2336.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       LotFrontage        LotArea  OverallQual  OverallCond    YearBuilt  \\\n",
       "count  2919.000000    2919.000000  2919.000000  2919.000000  2919.000000   \n",
       "mean     69.305795   10168.114080     6.089072     5.564577  1971.312778   \n",
       "std      21.312345    7886.996359     1.409947     1.113131    30.291442   \n",
       "min      21.000000    1300.000000     1.000000     1.000000  1872.000000   \n",
       "25%      60.000000    7478.000000     5.000000     5.000000  1953.500000   \n",
       "50%      69.305795    9453.000000     6.000000     5.000000  1973.000000   \n",
       "75%      78.000000   11570.000000     7.000000     6.000000  2001.000000   \n",
       "max     313.000000  215245.000000    10.000000     9.000000  2010.000000   \n",
       "\n",
       "       YearRemodAdd   MasVnrArea   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  \\\n",
       "count   2919.000000  2919.000000  2919.000000  2919.000000  2919.000000   \n",
       "mean    1984.264474   102.201312   441.423235    49.582248   560.772104   \n",
       "std       20.894344   178.626089   455.532750   169.176615   439.468337   \n",
       "min     1950.000000     0.000000     0.000000     0.000000     0.000000   \n",
       "25%     1965.000000     0.000000     0.000000     0.000000   220.000000   \n",
       "50%     1993.000000     0.000000   369.000000     0.000000   467.000000   \n",
       "75%     2004.000000   163.500000   733.000000     0.000000   805.000000   \n",
       "max     2010.000000  1600.000000  5644.000000  1526.000000  2336.000000   \n",
       "\n",
       "               ...            SaleType_ConLw  SaleType_New  SaleType_Oth  \\\n",
       "count          ...               2919.000000   2919.000000   2919.000000   \n",
       "mean           ...                  0.002741      0.081877      0.002398   \n",
       "std            ...                  0.052289      0.274225      0.048920   \n",
       "min            ...                  0.000000      0.000000      0.000000   \n",
       "25%            ...                  0.000000      0.000000      0.000000   \n",
       "50%            ...                  0.000000      0.000000      0.000000   \n",
       "75%            ...                  0.000000      0.000000      0.000000   \n",
       "max            ...                  1.000000      1.000000      1.000000   \n",
       "\n",
       "       SaleType_WD  SaleCondition_Abnorml  SaleCondition_AdjLand  \\\n",
       "count  2919.000000            2919.000000            2919.000000   \n",
       "mean      0.865022               0.065091               0.004111   \n",
       "std       0.341758               0.246728               0.063996   \n",
       "min       0.000000               0.000000               0.000000   \n",
       "25%       1.000000               0.000000               0.000000   \n",
       "50%       1.000000               0.000000               0.000000   \n",
       "75%       1.000000               0.000000               0.000000   \n",
       "max       1.000000               1.000000               1.000000   \n",
       "\n",
       "       SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \\\n",
       "count           2919.000000           2919.000000           2919.000000   \n",
       "mean               0.008222              0.015759              0.822885   \n",
       "std                0.090317              0.124562              0.381832   \n",
       "min                0.000000              0.000000              0.000000   \n",
       "25%                0.000000              0.000000              1.000000   \n",
       "50%                0.000000              0.000000              1.000000   \n",
       "75%                0.000000              0.000000              1.000000   \n",
       "max                1.000000              1.000000              1.000000   \n",
       "\n",
       "       SaleCondition_Partial  \n",
       "count            2919.000000  \n",
       "mean                0.083933  \n",
       "std                 0.277335  \n",
       "min                 0.000000  \n",
       "25%                 0.000000  \n",
       "50%                 0.000000  \n",
       "75%                 0.000000  \n",
       "max                 1.000000  \n",
       "\n",
       "[8 rows x 303 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 标准化所有的numerical数据(x-x')/s,在这里，不需要把One-hot的那些数据标准化，目标是那些本来就是numerical的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt',\n",
       "       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',\n",
       "       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',\n",
       "       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',\n",
       "       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',\n",
       "       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',\n",
       "       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',\n",
       "       'MoSold', 'YrSold'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#找到所有的numerical数据\n",
    "numerical_cols = data_all.columns[data_all.dtypes !=\"object\"]\n",
    "numerical_cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "numerical_means = data_all_dummy.loc[:, numerical_cols].mean()\n",
    "numerical_std = data_all_dummy.loc[:, numerical_cols].std()\n",
    "data_all_dummy.loc[:, numerical_cols] = (data_all_dummy.loc[:, numerical_cols]-numerical_means)/numerical_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>...</th>\n",
       "      <th>SaleType_ConLw</th>\n",
       "      <th>SaleType_New</th>\n",
       "      <th>SaleType_Oth</th>\n",
       "      <th>SaleType_WD</th>\n",
       "      <th>SaleCondition_Abnorml</th>\n",
       "      <th>SaleCondition_AdjLand</th>\n",
       "      <th>SaleCondition_Alloca</th>\n",
       "      <th>SaleCondition_Family</th>\n",
       "      <th>SaleCondition_Normal</th>\n",
       "      <th>SaleCondition_Partial</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.238851</td>\n",
       "      <td>-0.058339</td>\n",
       "      <td>-0.405244</td>\n",
       "      <td>-0.398335</td>\n",
       "      <td>0.077596</td>\n",
       "      <td>-0.241973</td>\n",
       "      <td>-0.215070</td>\n",
       "      <td>-0.222119</td>\n",
       "      <td>-0.200343</td>\n",
       "      <td>-0.081305</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.186298</td>\n",
       "      <td>-0.060928</td>\n",
       "      <td>-0.692984</td>\n",
       "      <td>2.032037</td>\n",
       "      <td>-0.800143</td>\n",
       "      <td>-1.042991</td>\n",
       "      <td>-0.199736</td>\n",
       "      <td>0.130060</td>\n",
       "      <td>-0.200343</td>\n",
       "      <td>-0.389152</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.248084</td>\n",
       "      <td>-0.060093</td>\n",
       "      <td>-0.405244</td>\n",
       "      <td>-0.398335</td>\n",
       "      <td>-0.032108</td>\n",
       "      <td>-0.345305</td>\n",
       "      <td>-0.263696</td>\n",
       "      <td>-0.332039</td>\n",
       "      <td>-0.200343</td>\n",
       "      <td>-0.591413</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.201526</td>\n",
       "      <td>-0.060870</td>\n",
       "      <td>-0.405244</td>\n",
       "      <td>-0.398335</td>\n",
       "      <td>2.014349</td>\n",
       "      <td>-0.660007</td>\n",
       "      <td>-0.199736</td>\n",
       "      <td>-0.253125</td>\n",
       "      <td>-0.200343</td>\n",
       "      <td>-0.643671</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>-0.130603</td>\n",
       "      <td>-0.044755</td>\n",
       "      <td>0.582714</td>\n",
       "      <td>-0.398335</td>\n",
       "      <td>-0.084279</td>\n",
       "      <td>-0.534971</td>\n",
       "      <td>0.274752</td>\n",
       "      <td>-0.261530</td>\n",
       "      <td>-0.200343</td>\n",
       "      <td>-0.628377</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 303 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  YearRemodAdd  \\\n",
       "Id                                                                             \n",
       "1     -0.238851 -0.058339    -0.405244    -0.398335   0.077596     -0.241973   \n",
       "2     -0.186298 -0.060928    -0.692984     2.032037  -0.800143     -1.042991   \n",
       "3     -0.248084 -0.060093    -0.405244    -0.398335  -0.032108     -0.345305   \n",
       "4     -0.201526 -0.060870    -0.405244    -0.398335   2.014349     -0.660007   \n",
       "5     -0.130603 -0.044755     0.582714    -0.398335  -0.084279     -0.534971   \n",
       "\n",
       "    MasVnrArea  BsmtFinSF1  BsmtFinSF2  BsmtUnfSF          ...            \\\n",
       "Id                                                         ...             \n",
       "1    -0.215070   -0.222119   -0.200343  -0.081305          ...             \n",
       "2    -0.199736    0.130060   -0.200343  -0.389152          ...             \n",
       "3    -0.263696   -0.332039   -0.200343  -0.591413          ...             \n",
       "4    -0.199736   -0.253125   -0.200343  -0.643671          ...             \n",
       "5     0.274752   -0.261530   -0.200343  -0.628377          ...             \n",
       "\n",
       "    SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \\\n",
       "Id                                                            \n",
       "1                0             0             0            1   \n",
       "2                0             0             0            1   \n",
       "3                0             0             0            1   \n",
       "4                0             0             0            1   \n",
       "5                0             0             0            1   \n",
       "\n",
       "    SaleCondition_Abnorml  SaleCondition_AdjLand  SaleCondition_Alloca  \\\n",
       "Id                                                                       \n",
       "1                       0                      0                     0   \n",
       "2                       0                      0                     0   \n",
       "3                       0                      0                     0   \n",
       "4                       1                      0                     0   \n",
       "5                       0                      0                     0   \n",
       "\n",
       "    SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  \n",
       "Id                                                                     \n",
       "1                      0                     1                      0  \n",
       "2                      0                     1                      0  \n",
       "3                      0                     1                      0  \n",
       "4                      0                     0                      0  \n",
       "5                      0                     1                      0  \n",
       "\n",
       "[5 rows x 303 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_dummy.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step4 建立模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据集划分\n",
    "x_train = data_all_dummy.loc[data_tra.index].values\n",
    "x_test = data_all_dummy.loc[data_tes.index].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Ridge Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.ensemble import RandomForestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score Ridge with 10-fold cross-validation over a log-spaced grid of alpha values,\n",
    "# keeping the mean RMSE per alpha so the best-performing alpha can be read off.\n",
    "alphas = np.logspace(-3, 2, 50)\n",
    "test_scores = []\n",
    "for alpha in alphas:\n",
    "    fold_neg_mse = cross_val_score(\n",
    "        Ridge(alpha=alpha), x_train, y_train, cv=10, scoring=\"neg_mean_squared_error\"\n",
    "    )\n",
    "    # The scorer returns negated MSE, so flip the sign before taking the root\n",
    "    test_scores.append(np.sqrt(-fold_neg_mse).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x1fef746b278>]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAHYNJREFUeJzt3X1wXfV95/H3915ZT1fPlizLerAlbMACjM0qDiEJSROSmJCFbrbdwDZb2mGGyWzpdjeb6dBhh0zodGYT2iTtDu2GTdhtm93QQNPEm4UQQtm2m4YUE2ODDX7ANrYsW5af9Gw93e/+cY/ka3GvdI0lX/l3P68ZjXTOPVf6Ho746Ofv+Z1zzN0REZHCEMt3ASIicvko9EVECohCX0SkgCj0RUQKiEJfRKSAKPRFRAqIQl9EpIAo9EVECohCX0SkgBTlu4DZ6uvrfc2aNfkuQ0TkivLKK6+cdPeG+bZbcqG/Zs0atm3blu8yRESuKGb2di7bqb0jIlJAFPoiIgVEoS8iUkAU+iIiBUShLyJSQBT6IiIFRKEvIlJAggn9kfFJvvrjPWw/fCbfpYiILFnBhP7o+BR/8rf7ee1of75LERFZsoIJfRERmZ9CX0SkgCj0RUQKiEJfRKSAKPRFRAqIQl9EpIAEF/ru+a5ARGTpCib0zSzfJYiILHk5hb6ZbTGzPWa238wezPD6581st5ntNLMXzGx1tH61mb1iZq+a2S4z+9xC74CIiORu3tA3szjwGHA70AncY2adszbbDnS5+wbgaeAr0fpjwC3uvhF4L/Cgma1aqOJFROTi5DLS3wzsd/cD7j4OPAnclb6Bu7/o7iPR4ktAS7R+3N3HovUlOf48ERFZJLmEcDNwJG25O1qXzX3As9MLZtZqZjuj7/Fld+95N4WKiMilyyX0M50hzThHxsw+C3QBj85s6H4kavusBe41s8YM77vfzLaZ2ba+vr7cKhcRkYuWS+h3A61pyy3AO0brZnYb8BBwZ1pLZ0Y0wt8FfDDDa4+7e5e7dzU0NORau4iIXKRcQv9lYJ2ZtZtZMXA3sDV9AzPbBHyDVOCfSFvfYmZl0de1wPuBPQtVvIiIXJyi+TZw90kzewB4DogDT7j7LjN7BNjm7ltJtXMqgKei+fKH3f1OYD3wR2bmpNpEf+jury3SvoiIyDzmDX0Ad38GeGbWuofTvr4ty/ueBzZcSoEiIrJwgptC6boPg4hIVsGEvm7CICIyv2BCX0RE5qfQFxEpIAp9EZECotAXESkgCn0RkQISXOhrwqaISHbBhL4enCUiMr9gQl9EROan0BcRKSAKfRGRAhJc6OvWOyIi2QUT+qa774iIzCuY0J+mgb6ISHbhhL4G+iIi8won9CO6n76ISHbBhL4uzhIRmV8woS8iIvNT6IuIFBCFvohIAVHoi4gUkGBCf/o8ribviIhkF07oa/qOiMi8ggn9aa5rckVEsgom9DXOFxGZXzChP009fRGR7IIJfbX0RUTmF0zoT9NAX0Qku2BCX/fTFxGZXzChLyIi8wsu9HUiV0Qku2BCXydyRUTml1Pom9kWM9tjZvvN7MEMr3/ezHab2U4ze8HMVkfrN5rZz8xsV/TaZxZ6B2bTxVkiItnNG/pmFgceA24HOoF7zKxz1mbbgS533wA8DXwlWj8C/Lq7XwdsAb5uZjULVXwmau+IiGSXy0h/M7Df3Q+4+zjwJHBX+gbu/qK7j0SLLwEt0fq97r4v+roHOAE0LFTx6dTeERGZXy6h3wwcSVvujtZlcx/w7OyVZrYZKAbeyvDa/Wa2zcy29fX15VDSO2nKpojI/HIJ/UxpmrGJYmafBbqAR2etbwL+EvhNd0++45u5P+7uXe7e1dBwaf8Q0IPRRUSyK8phm26gNW25BeiZvZGZ3QY8BHzI3cfS1lcB/wf4T+7+0qWVm910e0eZLyKSXS4j/ZeBdWbWbmbFwN3A
1vQNzGwT8A3gTnc/kba+GPgb4C/c/amFK/ud1NwREZnfvKHv7pPAA8BzwBvAd919l5k9YmZ3Rps9ClQAT5nZq2Y2/UfhXwG3Ar8RrX/VzDYu/G6k1buY31xE5AqXS3sHd38GeGbWuofTvr4ty/u+DXz7UgrM1fSTs9TeERHJLpwrcvNdgIjIFSCY0J+mK3JFRLILJvQ1e0dEZH4BhX7U089zHSIiS1kwoT9DQ30RkayCCn0zjfRFROYSVuijgb6IyFyCCv2YGUmlvohIVkGFvto7IiJzCyv0MbV3RETmEFToY7o4S0RkLkGFvoH6OyIicwgq9HUiV0RkbkGFvpmmbIqIzCWo0I+ZqbsjIjKHoELfQO0dEZE5hBX6au+IiMwpsNA3XKkvIpJVUKEfM0gq80VEsgos9E0XZ4mIzCGo0DczjfRFROYQVOjHDPX0RUTmEFjoG8lkvqsQEVm6Agt9zdMXEZlLUKFvZkwp9EVEsgoq9GMxdJdNEZE5hBX6GumLiMwpqNCPa8qmiMicggp9M0gq9UVEsgoq9OMxPURFRGQuQYV+zIwpjfRFRLIKLvQ10hcRyS6n0DezLWa2x8z2m9mDGV7/vJntNrOdZvaCma1Oe+1HZnbWzH64kIVnEo9ppC8iMpd5Q9/M4sBjwO1AJ3CPmXXO2mw70OXuG4Cnga+kvfYo8G8Wpty5xWPGlDJfRCSrXEb6m4H97n7A3ceBJ4G70jdw9xfdfSRafAloSXvtBWBwgeqdU2qkr5vviIhkk0voNwNH0pa7o3XZ3Ac8eylFvVtxncgVEZlTUQ7bWIZ1GZPVzD4LdAEfupgizOx+4H6Atra2i3nrBdTTFxGZWy4j/W6gNW25BeiZvZGZ3QY8BNzp7mMXU4S7P+7uXe7e1dDQcDFvvUBR3JhUe0dEJKtcQv9lYJ2ZtZtZMXA3sDV9AzPbBHyDVOCfWPgyc6ORvojI3OYNfXefBB4AngPeAL7r7rvM7BEzuzPa7FGgAnjKzF41s5k/Cmb2D8BTwEfNrNvMPrHgexEpihmTCn0Rkaxy6enj7s8Az8xa93Da17fN8d4PvuvqLpJG+iIicwvqityiWIyJKfX0RUSyCSr0NdIXEZlbUKFfFDcmdEmuiEhWQYX+slhMUzZFROYQVOjH42rviIjMJajQXxZTe0dEZC5BhX5RPMakZu+IiGQVVOgvi8eYUHtHRCSroEK/OG5MTCVxPT1LRCSjoEJ/WTyGOzqZKyKSRVChX1yU2h2dzBURySyo0F8WT+3OuE7miohkFFboRyP98UmFvohIJkGFfnE89ZAv3XRNRCSzsEI/GumPaaQvIpJRWKEfjwNq74iIZBNW6KunLyIyp6BCv2SmvTOV50pERJamoEJfI30RkbkFFfolOpErIjKnoEK/dFnqRO65CbV3REQyCTP01dMXEckosNBP7c65CbV3REQyCSv0i1Ij/TG1d0REMgor9GfaOxrpi4hkElToT8/eGR3XSF9EJJOgQj8WM0qKYpq9IyKSRVChD1BeHGdEI30RkYwCDP0ihb6ISBbBhX5ZcZzRicl8lyEisiSFF/rL1N4REckmvNBXT19EJKvgQj9RHNeUTRGRLHIKfTPbYmZ7zGy/mT2Y4fXPm9luM9tpZi+Y2eq01+41s33Rx70LWXwmiZIihsfU0xcRyWTe0DezOPAYcDvQCdxjZp2zNtsOdLn7BuBp4CvRe+uALwLvBTYDXzSz2oUr/50qSooYVOiLiGSUy0h/M7Df3Q+4+zjwJHBX+gbu/qK7j0SLLwEt0defAJ5399PufgZ4HtiyMKVnVqGRvohIVrmEfjNwJG25O1qXzX3As+/yvZcsUZKap59M+mL+GBGRK1JRDttYhnUZE9XMPgt0AR+6mPea2f3A/QBtbW05lJRdRUlql4bHJ6ksXXZJ30tEJDS5jPS7gda05RagZ/ZGZnYb8BBwp7uPXcx73f1xd+9y966GhoZca8+oojQV+kNq8YiIvEMuof8y
sM7M2s2sGLgb2Jq+gZltAr5BKvBPpL30HPBxM6uNTuB+PFq3aKZH+kPnFPoiIrPN295x90kze4BUWMeBJ9x9l5k9Amxz963Ao0AF8JSZARx29zvd/bSZ/T6pPxwAj7j76UXZk0h1Waql0z86sZg/RkTkipRLTx93fwZ4Zta6h9O+vm2O9z4BPPFuC7xYVQp9EZGsgrsiVyN9EZHsgg39AYW+iMg7BBf6VdHsnf5RncgVEZktuNAviseoKClSe0dEJIPgQh9SLZ6zI+P5LkNEZMnJafbOlaYuUcxphb6IXAHcnZND4xw8OcxkMsktV9Uv6s8LNvRPDSn0RWTpGBqb5NDJYQ6cHOZg3zAHTw7NfD19Z+Abmqv537/9gUWtI8jQX54oZv+JoXyXISIFZmIqyZHTIxzoG+bgdMCfHOLgyWF6B8ZmtjOD5poy2usTfPqmZtrrE3Q0VNDRkFj0GoMM/bpEMaeGx+bfUETkIrk7vQNjHDg5NBPu0x+HT48wlXaH37pEMe31CW5d10B7Q4KO+gTt9RWsXl5O6bJ4XuoPM/Qrijk3kWRkfJLy4iB3UUQWWf/oRBTmQxzsG+atqBVz6NTwBc/hLl0Wo72+gs6mKu64oYmOhgTt9amPmvLiPO5BZkEmYn2iBIBTQ+OU1wW5iyKyAMYmp3j71EjaiD3VijnQN8yp4fPnBWMGrXXldNQnuLljedqoPcHKqlJisUx3kV+agkzE5RWpv659Q2O01pXnuRoRyadk0unpH50J8/Re+9Ezo6Q/b6mhsoT2+gQf62yc6bO31ydoqyunuCiMGe5Bhn5jVSkAvf3n8lyJiFwO7s6ZkYnUjJjpYI8+Hzo1zNhkcmbbRHGc9oYEm1pr+fSmlgvaMYXw4KUgQ7+pOhX6xwcU+iIhGR2fSjtxGk15jAI+/Sr8opjRtjzVjrn16vqZEXtHfYKGyhKiW8AXpCBDvy5RTHE8ptAXuQJNTiU5enaUA30XTnk82DdMz6x/va+sKqWjIcGnNjRF7ZgEHfUVtNSWURQPox2z0IIMfTOjsbqE42rviCxZY5OpUfve3iH29Q6yt3eQ/SeGOHx6hImp8432ytIiOhoqeG/H8tTJ06gds2Z5gkRJkBG2qIL9L7ayqlShL7IEpIf7/t5B9vYOsffEIG+fOj+nPWawZnmCtSsq+Fjnyplw76hPUJcoLuh2zEILN/Sry3it+2y+yxApGOOTySjcB6OR+xD7TgxyKEu4f/L6JtY1VnB1YyXt9Ym8XaxUaIIN/VU1pTz3+jmmkk78CppDK7LUjU8mOXQqFe7prZnZ4b56eYJ1Kyq4PQr3dSsq6WhQuOdbsKG/ui7B+FSS4wPnaK4py3c5Ilec9HDfF43a9/YOcejkMJNRuJvB6rpy1jVWsuX6lVzdWMnaFRVc1VChcF+igg39NctTF2W9fXJYoS8yh4mpJIeinvv0ydS9vYPRrX7fGe4f72zk6sZK1jUq3K9EwYb+6vrU3eoOnRrhlrV5LkZkCZiYSvL2qfPhvq83c7i31ZWzbkUlH4vCfe2KCtauULiHItjQb6oqpbgoxtunhvNdishllR7u+6KZMvuicJ+eCnk+3Cu4rbORq6Oe+1UNFZQVK9xDFmzox2JGW105hxT6Eqhk0uk+M8ruY/3sOZ7que/rHeLAyaELwr21tpyrGyv46PpG1q1IzZZRuBeuYEMfUn39gycV+nLlGx2fYk/vIG8cG+CNYwPs7hngzeODDEVPXJoO93UrKvila1dwdaPCXTILOvSvbqzk/+7pY2xyipIi/eLL0ufunBgcY3dauL9xbICDJ4dn7gZZUVLE+qZKPn1TM+ubqljfVMXVjRV6doTkJOjfks5VVUwmnX29Q1zfXJ3vckQuMDGV5K2+obRwT43k0+/j3lJbxvqmKu7YsIrOpio6m6poqS27ou7fLktL2KHfVAXA7mMDCn3Jq/6RCXYfG5gZwb9xbIB9vUOMT6Vu+VtcFOOaxko+un4FndHo
/dqmKqrLwr/Vr1xeQYf+6uUJyovj7O4ZyHcpUiCSSefw6ZELwn13z8AFd4esryhhfVMlv/mBNTMB31Gf0F0h5bIIOvTjMePalZXsPqbQl4U3Mj7Jm8cvPLm65/ggw9HzU+Mxo6M+wXva62Z67+ubKllRWZrnyqWQBR36kOrr/2B7D8mkqw8q74q7c3zgXBTug+dPrp4axqOTq5UlRaxfVcWvdrWyvqmSzqZq1jXqgiZZeoIP/Y2ttXz7pcPsPTHItSur8l2OLHHuzrH+c7x2tJ/Xj/azszv1Of3kaltdOeubKrlz46qZ9kxLbZlu/ytXhOBD/31XLQfgp/tPKfTlAu5O78AYrx3t57Xus6nPR/s5OZQK+HjMWLeigo9cu4Lrm6vpXFXFtSsrC+I5qhKunELfzLYAfwzEgW+6+3+e9fqtwNeBDcDd7v502mtfBu6IFn/f3f9qIQrPVXNNGWuWl/Ozt05y3wfaL+ePliXmxMA5dnb3z4T7a0f76RscA1K3Al63opIPX7OCG5qruaGlms6mKrVnJDjzhr6ZxYHHgI8B3cDLZrbV3XenbXYY+A3gC7PeewdwE7ARKAH+zsyedffLemb1fVfV88MdPUxOJTVDokCcGDx3QXtmZ3c/J9ICfu2KCj64rp4NMwFfrStXpSDkMtLfDOx39wMAZvYkcBcwE/rufih6LTnrvZ3A37n7JDBpZjuALcB3L7303L1/7XK+80+Heb1ngI2tNZfzR8tl0Dc4xuvRyH065I8PpKZImsFVDRV8YG091zdXs6El1abR1atSqHL5zW8GjqQtdwPvzfH77wC+aGZfBcqBXyLtj8U0M7sfuB+gra0tx2+du/d1pPr6f7+3T6F/hTs1NN2DP9+iOdZ/PuA76hPc3FHHDS013NBczXWrqvTwbJE0ufzfkGlKgmdY986N3H9sZu8B/hHoA34GTGbY7nHgcYCurq6cvvfFWF5RwuY1dfzg1aP89kfWapbFFWJobJLXuvt59chZdhxJnWg9enZ05vWO+gSb2+tSPfjmaq5rrqZCAS8yp1z+D+kGWtOWW4CeXH+Au/8B8AcAZva/gH0XU+BC+fRNzTz4vdfY2d3PjRrtLzmTU0n29g6xo/ssrx4+y6tHzrLvxODMTcZWLy/nptW13HvLam5oruG65iqqNItG5KLlEvovA+vMrB04CtwN/Otcvnl0ErjG3U+Z2QZSs3t+/G6LvRSf3NDEw1t38b1fdCv082x6Lvz0CH77kbO81t3P6ETqStaa8mVsbK1hy/Ur2dhWw8aWGmoTxXmuWiQM84a+u0+a2QPAc6SmbD7h7rvM7BFgm7tvjVo4fwPUAv/czL7k7tcBy4B/iNopA8Bno5O6l11V6TI+3tnI1h09PHRHJ8VFmsVzuQyNTbIzCvcdR1Kj+OmZNMXxGJ2rqvjMe1rZ2FrDxtYaVi8vVwtOZJHk1AB192eAZ2atezjt65dJtX1mv+8cqRk8S8K/vKmFH+48xvO7e7ljQ1O+ywnS5FSSPb2DM6P4VJtmaOZ2Be31Cd6/tp4bW6rZ2FbL+qZKPetA5DIqqLNeH1xXT0dDgq/9ZC+fuK5Rc/YXwLH+UbYfPsv2w2fYcSQ1m2a6TVMbtWnuuGEVN7ZWs7G1hppytWlE8qmgQr8oHuN3P3Etn/v2Kzz9Sjd3b1746aEhOzcxxetH+/nF4TNR0J+dmQ9fXBTjulVV3L35fJumrU5tGpGlpqBCH+AT1zVyU1sNX/vJXu7a2KyrMLNwd46cHo0C/gzbj5xld88Ak9F0mta6Mt7bUcem1ho2ttXS2VSl8yQiV4CCC30z4/c+uZ5f/a8/41v/7wAPfGRdvktaEobHJtnRfXamVbP98NmZO0uWF8fZ0FLN/bd2sKmtlo2tNTRUluS5YhF5Nwou9AHes6aO269fydd/so9/trpu5k6chSKZdA6cHJ4Zwf/i7TPs7T0/J76jIcGHr1nBTatr2NRay9WNFTr/IRKIggx9gC//
ygb2/+k/8rlvv8L3f+v9tNcn8l3Sojk5NMarh8+mLnyKZtUMnEvNnK0sLWJjaw0fv24lm9pq2KSTrSJBM/cFv+vBJenq6vJt27Zdlp91+NQIv/ynP6WmbBnf+7e3BBF2o+NT7OpJ3bpgel5895nUrQtiBtesrIpOtFZzU1stVzVU6IliIgEws1fcvWve7Qo59AFePnSaX/tvP+ealZX8l3s2seYKGvEnk85bfUMXXPT05vFBpqI+TXNN2cxUyY2ttVzfrLtLioRKoX8Rnt/dyxee2sHEVJKHP9XJZ97TuuSmGk5OJTlwcpjdPQPsPjYwc4/4obGoTVNSxIYo4G9sSU2ZXFGlB3CLFAqF/kU61j/KF57awU/3n+JjnY186c7rWFVTdtnrgNRtC948lgr36ZB/8/gg45OpxxUUx2Ncs7KSG1urubGlhk1tNXTUq00jUsgU+u9CMuk88dODfOVHe5hIJrm5fTn/YlMzW25YuSh3dBw8N8Hh0yMcOT3KW31D7OrpZ3fPAIdOjcxsU1O+jOtWVdHZVEXnqio6m6rpaEiwTLNpRCSNQv8SHDk9wl//opsfvNrDwZPDFBfF+Gj0cOzmmjKaa8torimjsaqUeIbRdTLpnJuc4txEkqFzk3SfGeHw6fMfR6LPZ0YmLnjf6uXlqXCfDvhVVaysKl1yrSYRWXoU+gvA3dnR3c/3tx/l2deP0TswdsHrRTFjZXUpZnBuIsm5iSnGJpKMT81+auT57Vtqy2itK6e1rpy2tI/Vy8up1P3hReRdUugvgpHxSXrOjtJ9ZpSjZ0c5emZ05lF9pctilBTFKV0Wp3RZLPW5KEZ5SREttWW01ZWzsqpUFzmJyKLINfQ1f+8ilBcXsXZFJWtXVOa7FBGRd0XDThGRAqLQFxEpIAp9EZECotAXESkgCn0RkQKi0BcRKSAKfRGRAqLQFxEpIEvuilwz6wPevoRvUQ+cXKByrhSFts+Ftr+gfS4Ul7LPq929Yb6NllzoXyoz25bLpcghKbR9LrT9Be1zobgc+6z2johIAVHoi4gUkBBD//F8F5AHhbbPhba/oH0uFIu+z8H19EVEJLsQR/oiIpJFMKFvZlvMbI+Z7TezB/Ndz2Iws1Yze9HM3jCzXWb2O9H6OjN73sz2RZ9r813rQjOzuJltN7MfRsvtZvbzaJ//ysyK813jQjKzGjN72szejI73+0I/zmb2H6Lf69fN7DtmVhracTazJ8zshJm9nrYu43G1lD+JMm2nmd20EDUEEfpmFgceA24HOoF7zKwzv1UtikngP7r7euBm4Lei/XwQeMHd1wEvRMuh+R3gjbTlLwNfi/b5DHBfXqpaPH8M/MjdrwVuJLXvwR5nM2sG/h3Q5e7XA3HgbsI7zv8D2DJrXbbjejuwLvq4H/izhSggiNAHNgP73f2Au48DTwJ35bmmBefux9z9F9HXg6SCoJnUvv55tNmfA7+cnwoXh5m1AHcA34yWDfgI8HS0SVD7bGZVwK3AtwDcfdzdzxL4cSb1JL8yMysCyoFjBHac3f3vgdOzVmc7rncBf+EpLwE1ZtZ0qTWEEvrNwJG05e5oXbDMbA2wCfg50OjuxyD1hwFYkb/KFsXXgd8Fpp84vxw46+6T0XJox7sD6AP+e9TS+qaZJQj4OLv7UeAPgcOkwr4feIWwj/O0bMd1UXItlNC3DOuCnZZkZhXAXwP/3t0H8l3PYjKzTwEn3P2V9NUZNg3peBcBNwF/5u6bgGECauVkEvWx7wLagVVAglR7Y7aQjvN8FuX3PJTQ7wZa05ZbgJ481bKozGwZqcD/n+7+vWh17/Q/+6LPJ/JV3yJ4P3CnmR0i1bb7CKmRf03UBoDwjnc30O3uP4+Wnyb1RyDk43wbcNDd+9x9AvgecAthH+dp2Y7rouRaKKH/MrAuOtNfTOoE0NY817Tgol72t4A33P2raS9tBe6Nvr4X+MHlrm2xuPvvuXuLu68hdVz/1t1/DXgR+JVos9D2+Thw
xMyuiVZ9FNhNwMeZVFvnZjMrj37Pp/c52OOcJttx3Qr8ejSL52agf7oNdEncPYgP4JPAXuAt4KF817NI+/gBUv+82wm8Gn18klSP+wVgX/S5Lt+1LtL+fxj4YfR1B/BPwH7gKaAk3/Ut8L5uBLZFx/r7QG3oxxn4EvAm8Drwl0BJaMcZ+A6pcxYTpEby92U7rqTaO49FmfYaqZlNl1yDrsgVESkgobR3REQkBwp9EZECotAXESkgCn0RkQKi0BcRKSAKfRGRAqLQFxEpIAp9EZEC8v8B48SJVKU4wkAAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1fef0ace4e0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "# Mean cross-validated RMSE for each Ridge alpha.\n",
    "# The alpha grid is log-spaced, so a log x-axis spreads the points evenly.\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(alphas, test_scores)\n",
    "ax.set_xscale(\"log\")\n",
    "# Trailing semicolon suppresses the matplotlib object repr in the cell output\n",
    "ax.set(title=\"Ridge: CV error vs. alpha\", xlabel=\"alpha\", ylabel=\"mean CV RMSE\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Random Forest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score a random forest with 5-fold cross-validation for several max_features fractions,\n",
    "# keeping the mean RMSE per setting.\n",
    "# NOTE: this rebinds test_scores, replacing the Ridge results stored under the same name.\n",
    "max_features = [0.1, 0.3, 0.5, 0.7, 0.9, 0.99]\n",
    "test_scores = []\n",
    "for max_feat in max_features:\n",
    "    # Fixed random_state so the comparison is reproducible across re-runs (42 is arbitrary)\n",
    "    clf = RandomForestRegressor(n_estimators=200, max_features=max_feat, random_state=42)\n",
    "    test_score = np.sqrt(-cross_val_score(clf, x_train, y_train, cv=5, scoring=\"neg_mean_squared_error\"))\n",
    "    test_scores.append(np.mean(test_score))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<matplotlib.lines.Line2D at 0x1fef78e9898>]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAD8CAYAAAB3u9PLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XlcVXX+x/HXRxAU3FBRc8ElcV8R0JzRymqyTVu0XFosy2pyalrHZqZmcqZpJqdptTGdrHHXzMzKMnPaJjdA3AA1NMTrigsqqCDcz++Pe+vHGMYRgcOFz/Px4BH3LN/zuSe873u+37OIqmKMMcbUcLsAY4wxlYMFgjHGGMACwRhjjJ8FgjHGGMACwRhjjJ8FgjHGGMACwRhjjJ8FgjHGGMBhIIjIYBHZKiLpIjKhmPkDRWSdiBSIyLAz5kWJyKcikiYiqSLSxj/9Mv8660XkvyLSvizekDHGmNKRkq5UFpEgYBtwBeABEoCRqppaZJk2QD3gMWCJqi4sMu8L4FlVXS4idQCvqp4QkW3AUFVNE5FfAvGqOuanamncuLG2adPmXN+jMcZUa0lJSQdVNbKk5YIdtBUPpKvqDgARmQcMBX4IBFXN8M/zFl1RRLoAwaq63L9cTpHZii9EAOoDe0oqpE2bNiQmJjoo2RhjzPdEZKeT5ZwEQgtgV5HXHqCvwzo6ANkisghoC3wGTFDVQuBuYKmInASOAf2Ka0BExgHjAKKiohxu1hhjzLlyMoYgxUxzeke8YGAAvq6kOKAdMMY/72HgalVtCbwF/KO4BlR1qqrGqmpsZGSJRzzGGGNKyUkgeIBWRV63xEH3TpF1k1V1h6oWAIuBGBGJBHqq6hr/cvOB/g7bNMYYUw6cBEICEC0ibUUkBBgBLHHYfgIQ4Q8AgEH4xh6OAPVFpIN/+hVAmvOyjTHGlLUSxxBUtUBExgPLgCBguqqmiMhEIFFVl4hIHPAeEAFcJyLPqGpXVS0UkceAFSIiQBIwzd/mPcC7/oHoI8Bd5fQejTHGOFDiaaeVSWxsrNpZRsYYc25EJElVY0tazq5UNsYYA1ggGGNMpXYwJ4+JH6SSm1dQ7tuyQDDGmEpKVXn8nQ3MWrMTz5GT5b49CwRjjKmk/r0yg8+3ZvHbqzrRsVndct+eBYIxxlRCaXuP8ZePt3Bpx0ju6N+mQrZpgWCMMZXMqdOFPDg3mXq1ajJpeE98Z+2XPyf3MjLGGFOBnv0ojW8P5PDvu+JpXCe0wrZrRwjGGFOJLE/dz8zVO7n75225uEPF3r/NAsEYYyqJ/cdO8cTCDXS5oB6PD+5Y4du3QDDGmErA61UeWbCeU6e9vDKyN6HBQRVegwWCMcZUAtO+3sE36Yf4w3VdaN+kjis1WCAYY4zLNnqymbRsK1d1a8Ytca1KXqGcWCAYY4yLcvMKeGjeeiLrhvLcjd0r7BTT4thpp8YY46JnPkgh41Auc+/pR4OwEFdrsSMEY4xxyYcb97Ag0cMDl7SnX7tGbpdjgWCMMW7wHDnBk4s20atVAx66PNrtcgCHgSAig0Vkq4iki8iEYuYPFJF1IlIgIsPOmBclIp+KSJqIpIpIG//0r0Vkvf9nj4gsLos3ZIwxlV1BoZeH569HFV4Z0ZuaQZXju3mJYwgiEgRMxvfcYw+QICJLVDW1yGKZwBjgsWKamAE8q6rLRaQO4AVQ1QFFtvEu8H5p34QxxgSSyZ9vJyHjCC/e0pOoRmFul/MDJ4PK8UC6qu4AEJF5wFDgh0BQ1Qz/PG/RFUWkCxCsqsv9y+Wc2biI1AUGAXeW7i0YY0zgSMw4zMsrtnF9r+bc0Lul2+X8DyfHKS2AXUVee/zTnOgAZIvIIhFJFpFJ/iOOom4AVqjqMYdtGmNMQDp26jQPzVtPi4jaTLy+m9vl/IiTQCjupFh12H4wMABfV1Ic0A5f11JRI4G5Z924yDgRSRSRxKysLIeb
NcaYykVV+d17m9l37BQv3dKberVqul3SjzgJBA9Q9NK5lsAeh+17gGRV3aGqBcBiIOb7mSLSCF+X1Edna0BVp6pqrKrGRkZW7J3/jDGmrCxat5sPNuzh15dF06d1hNvlFMtJICQA0SLSVkRCgBHAEoftJwARIvL9J/kgiow9AMOBD1X1lNOCjTEm0GQczOXp9zcT37Yhv7y0vdvlnFWJgeD/Zj8eWAakAQtUNUVEJorIEAARiRMRD74P+DdEJMW/biG+7qIVIrIJX/fTtCLNj+AnuouMMSbQnS708tC8ZIJqCC/d0ougGu7dmqIkjm5doapLgaVnTHu6yO8J+LqSilt3OdDjLPMucVqoMcYEoheXb2OD5yivj46heYPabpfzkyrH1RDGGFMFrdx+kH9+uZ0Rca24uvsFbpdTIgsEY4wpB0dy83lk/gbaNgrn6eu6uF2OIxYIxhhTxlSVCYs2cig3j1dG9iYsJDBuLG2BYIwxZWzu2l0sS9nPE1d2oluL+m6X45gFgjHGlKH0A8eZ+GEKA6IbM/bnbd0u55xYIBhjTBnJKyjkV3PXExYSzAvDe1KjEp9iWpzA6NgyxpgA8PwnW0nbe4w374ilSb1abpdzzuwIwRhjysAXWw/w5n+/446LWnNZ56Zul1MqFgjGGHOeso7n8dg7G+jYtC5PXt3Z7XJKzbqMjDHmPHi9ymPvbOD4qQJm392PWjXPvMN/4LAjBGOMOQ9vr8zgy21Z/P6aznRsVtftcs6LBYIxxpRS6p5j/PXjLVzeuQm39mvtdjnnzQLBGGNK4WR+IQ/OS6ZBWE2eH9YTkcA6xbQ4NoZgjDGl8OePUkk/kMOssX1pGB7idjllwo4QjDHmHC1L2cfsNZncO7AdP49u7HY5ZcYCwRhjzsG+o6f4zbsb6daiHo/+oqPb5ZQpCwRjjHGo0Ks8PH89eae9vDKiNyHBVesj1NG7EZHBIrJVRNJFZEIx8weKyDoRKRCRYWfMixKRT0UkTURSRaSNf7qIyLMiss0/78GyeEPGGFNepn61g1U7DvHMkK60i6zjdjllrsRBZREJAiYDVwAeIEFElqhqapHFMoEx+J6ffKYZwLOqulxE6gBe//QxQCugk6p6RaRJqd+FMcaUsw27snnh061c0/0ChscW+8TggOfkLKN4IF1VdwCIyDxgKPBDIKhqhn+et+iKItIFCPY/VxlVzSky+35glKp6/fMOlP5tGGNM+Tl+6jQPzUumSd1Q/nJD9ypximlxnHQZtQB2FXnt8U9zogOQLSKLRCRZRCb5jzgALgRuEZFEEflYRKKdl22MMRXD61Uenr+BXUdO8tKI3tQPq+l2SeXGSSAUF4XqsP1gYAC+rqQ4oB2+riKAUOCUqsYC04DpxW5cZJw/NBKzsrIcbtYYY8rGq/9J57O0/Tx1TWfi2zZ0u5xy5SQQPPj6+r/XEtjjsH0PkKyqO1S1AFgMxBSZ967/9/eAHsU1oKpTVTVWVWMjIyMdbtYYY87firT9vPjZNm6MacEd/du4XU65cxIICUC0iLQVkRBgBLDEYfsJQISIfP9JPoj/H3tY7H8NcDGwzWGbxhhT7nZk5fDreevp1qJelR43KKrEQPB/sx8PLAPSgAWqmiIiE0VkCICIxImIBxgOvCEiKf51C/F1F60QkU34up+m+Zv+K3CTf/pzwN1l+9aMMaZ0cvIKGDcziZrBNZhya5+AvqX1uRBVp8MB7ouNjdXExES3yzDGVGFer3L/7CQ+SzvAzLHx9L8w8G9NISJJ/vHan1S1LrMzxpjz9M8vt7MsZT9PXtWpSoTBubBAMMYYv8+3HODvn27l+l7NGfvztm6XU+EsEIwxBsg4mMuD85Lp3Kwez93Yo1oMIp/JAsEYU+3l5hUwbmYiQTWEN27rQ+2Q6jGIfCYLBGNMtaaqPL5wA+kHcnhtZAytGoa5XZJrLBCMMdXalC93sHTTPiZc1alKPeymNCwQjDHV1pfbsnh+2Rau7XEB
9wxo53Y5rrNAMMZUSzsP5fLg3GQ6Nq3L88Oq5yDymSwQjDHVzon8Au6dmQTA1NtiCQtx8iSAqs8CwRhTragqTyzcyLb9x3l1ZG+iGlXfQeQzWSAYY6qVaV/v4MONe3n8yk4M7GB3UC7KAsEYU23899uD/PXjLVzdvRn3XWyDyGeyQDDGVAu7Dp9g/Nx1tG9Sh0nDetogcjEsEIwxVd7J/ELunZmE16tMvS2W8FAbRC6O7RVjTJWmqkxYtJG0fceYPiaONo3D3S6p0rIjBGNMlfbmf7/j/fV7eOwXHbm0YxO3y6nUHAWCiAwWka0iki4iE4qZP1BE1olIgYgMO2NelIh8KiJpIpIqIm38098Wke9EZL3/p1dZvCFjjPneyvSDPPfxFgZ3bcYvL7nQ7XIqvRK7jEQkCJgMXAF4gAQRWaKqqUUWywTG4Htc5plmAM+q6nIRqQN4i8x7XFUXlrZ4Y4w5G8+RE4yfm0zbxuH8/WYbRHbCyRhCPJCuqjsARGQeMBT4IRBUNcM/r+iHPSLSBQhW1eX+5XLKpmxjjDm7U6cLuW9WEqcLvEy9rQ91bBDZESddRi2AXUVee/zTnOgAZIvIIhFJFpFJ/iOO7z0rIhtF5EURCXXYpjHGnJWq8tv3NrF59zFeGtGLdpF13C4pYDgJhOKOs9Rh+8HAAHxdSXFAO3xdSwBPAp380xsCvyl24yLjRCRRRBKzsrIcbtYYU139e2UGi9bt5uHLO3BZ56ZulxNQnASCB2hV5HVLYI/D9j1AsqruUNUCYDEQA6Cqe9UnD3gLX9fUj6jqVFWNVdXYyEi7zNwYc3ardxziTx+lcUWXpvxqUHu3ywk4TgIhAYgWkbYiEgKMAJY4bD8BiBCR7z/JB+EfexCRC/z/FeB6YPO5FG6MMUXtyT7JA7PX0bpRGP+4uSc1atgg8rkqMRD83+zHA8uANGCBqqaIyEQRGQIgInEi4gGGA2+ISIp/3UJ83UUrRGQTvu6naf6mZ/unbQIaA38u27dmjKkuvh9EzivwMvW2WOrWqul2SQHJ0dC7qi4Flp4x7ekivyfg60oqbt3lQI9ipg86p0qNMaYYqsrvF29mo+coU2/rQ/smNohcWnalsjEmoM1avZOFSR4evCyaX3Rt5nY5Ac0CwRgTsBIyDvPMB6lc1qkJv74s2u1yAp4FgjEmIO07eor7Z62jVcMw/nFLLxtELgN2+Z4xJuDkFfgGkU/mFzD3nr7Ur22DyGXBAsEYE1BUlacXp7B+VzZTbo0humldt0uqMqzLyBgTUOaszWR+4i7GX9qewd0ucLucKsUCwRgTMJJ2HuaPS1K4pGMkD1/Rwe1yqhwLBGNMQNh/7BT3zVpH8wa1efmW3gTZIHKZszEEY0yll1/g5f5ZSeTmFTBrbF/qh9kgcnmwQDDGVHp//CCFdZnZTB4VQ8dmNohcXqzLyBhTqc1dm8mcNZncd/GFXNPDBpHLkwWCMabSWpd5hD+8n8KA6MY8fmVHt8up8iwQjDGV0oHjp7h/VhJN64fy6kgbRK4INoZgjKl08gu8PDB7HcdOFrDol/1pEBbidknVggWCMabS+fNHqSRkHOGVkb3pfEE9t8upNqzLyBhTqSxI3MWMVTsZN7AdQ3o2d7ucasUCwRhTaWzYlc3vF2/mZ+0b8YQNIlc4R4EgIoNFZKuIpIvIhGLmDxSRdSJSICLDzpgXJSKfikiaiKSKSJsz5r8qIjnn8yaMMYEv63ge985MIrJOKK+OjCE4yL6vVrQS97iIBAGTgauALsBIEelyxmKZwBhgTjFNzAAmqWpnIB44UKTtWKBBqSo3xlQZpwu9PDBnHdkn85l6ex8ahtsgshucRHA8kK6qO1Q1H5gHDC26gKpmqOpGwFt0uj84gv3PVUZVc1T1hH9eEDAJeOL834YxJpA9+1Eaa787zN9u6kHX5vXdLqfachIILYBdRV57/NOc6ABki8giEUkWkUn+IAAYDyxR1b3OyzXGVDXvJnl4
e2UGY3/elqG9nH60mPLg5LTT4q4G0XNofwDQG1+30nxgjIh8DAwHLilx4yLjgHEAUVFRDjdrjAkEmzxH+e17m+jXriFPXtXJ7XKqPSdHCB6gVZHXLYE9Dtv3AMn+7qYCYDEQgy8g2gPpIpIBhIlIenENqOpUVY1V1djIyEiHmzXGVHaHcvK4b1YSjcJDmDzKBpErAydHCAlAtIi0BXYDI4BRDttPACJEJFJVs4BBQKKqfgQ0+34hEclR1fbnVroxJlAVFHoZPyeZgzl5LLyvP43qhLpdksHBEYL/m/14YBmQBixQ1RQRmSgiQwBEJE5EPPi6gd4QkRT/uoXAY8AKEdmEr/tpWvm8FWNMoHju4y2s2nGI527sTveWNohcWTi6dYWqLgWWnjHt6SK/J+DrSipu3eVAjxLar+OkDmNM4FucvJs3//sdY/q34caYYj82jEus084YU2E27z7KhEUbiW/bkN9d09ntcswZLBCMMRXicG4+985MIiLMN4hc0waRKx2726kxptwVFHr51dx1ZOXk8c69FxFZ1waRKyOLaGNMuZu0bCvfpB/iz9d3o2cru1tNZWWBYIwpV0s27OGNr3ZwW7/W3BzbquQVjGssEIwx5SZ1zzGeWLiBuDYRPHXtmffENJWNBYIxplxkn8jn3lmJ1K9dk8mjYwgJto+bys4GlY0xZa7Qq/xqbjL7j+Yx795+NKlby+2SjAMWCMaYMvf3T7fy9bcH+euN3YmJinC7HOOQHcMZY8rURxv38s8vtjOqbxQj4u0OxYHEAsEYU2a27jvO4ws3EBPVgD9cZ4PIgcYCwRhTJo6eOM24mYmEhwbzz1v7EBocVPJKplKxQDDGnLdCr/LQ/GT2ZJ9kyq0xNK1ng8iByAaVjTHn7cXl2/hiaxbP3tCNPq0bul2OKSU7QjDGnJdPNu/ltc/TGRHXilE2iBzQLBCMMaX27f7jPLpgA71aNeCZoV0RKe4R7CZQOAoEERksIltFJF1EJhQzf6CIrBORAhEZdsa8KBH5VETSRCRVRNr4p78pIhtEZKOILBQRe0iOMQHk2KnTjJuZRO2QYKbYIHKVUGIgiEgQMBm4CugCjBSRM88nywTGAHOKaWIGMElVOwPxwAH/9IdVtaeq9vCvP75U78AYU+G8XuXheevZdfgEr4+OoVl9G0SuCpwMKscD6aq6A0BE5gFDgdTvF1DVDP88b9EV/cER7H+MJqqaU2SdY/5lBKgN6Pm8EWNMxXlpxbes2HKAiUO7Et/WBpGrCiddRi2AXUVee/zTnOgAZIvIIhFJFpFJ/iMOAETkLWAf0Al41WGbxhgXfZqyj1dWfMvwPi25rV9rt8sxZchJIBQ3SuT023wwMAB4DIgD2uHrWvI1onon0BxIA24pduMi40QkUUQSs7KyHG7WGFMe0g/k8MiCDfRoWZ8/Xd/NBpGrGCeB4AGKPtWiJbDHYfseIFlVd6hqAbAYiCm6gKoWAvOBm4prQFWnqmqsqsZGRkY63KwxpqwdP+W7Ejk0uAZTbu1DrZo2iFzVOAmEBCBaRNqKSAgwAljisP0EIEJEvv8kHwSkik97+GEM4Tpgy7mVboypKF6v8siCDew8dILJo2No3qC22yWZclBiIPi/2Y8HluHr2lmgqikiMlFEhgCISJyIeIDhwBsikuJftxBfd9EKEdmEr/tpmv+///ZP2wRcAEws83dnjCkTr32ezvLU/fz+ms70a9fI7XJMORHVwDm5JzY2VhMTE90uw5hqZUXafu6ekcgNvVrwws09bdwgAIlIkqrGlrScXalsjDmrHVk5/Hreero2r8dfbuxuYVDFWSAYY4qVk1fAuJlJ1LRB5GrD7nZqjPkRr1d5dMF6vjuYy8yx8bSMCHO7JFMB7AjBGPM/Dubk8ciC9SxL2c+TV3Wi/4WN3S7JVBA7QjDGAL6H3MxZs5NJy7Zy8nQhD14Wzdift3W7LFOBLBCMMSRnHuGp9zezefcx+l/YiIlDu9G+id2A
uLqxQDCmGjucm8/zn2xhXsIumtYL5dWRvbm2xwV2NlE1ZYFgTDXk9SrzEnbx/LIt5JwqYNzAdjx4WTR1Qu0joTqz//vGVDMbPdk8tXgzGzxH6du2IX+6vhsdmtZ1uyxTCVSLQFi94xAn8wu5tFMTt0sxxjXZJ/KZtGwrc9Zm0ig8lJdu6cXQXs2te8j8oMoHgqry8mffkpBxmEnDe3BD75Zul2RMhfJ6lYVJHv76yRayT+Qzpn8bHr6iA/Vq1XS7NFPJVPlAEBGm3t6HcTOSeHj+Bg7nnrZT6Uy1kbLnKE8t3sy6zGxiW0cwcWhfujSv53ZZppKq8oEAULdWTd66M46H56/nTx+mcjg3j8d+0dEOlU2VdfTkaV5cvo0ZqzKICAvh78N7cmPvFtSoYX/z5uyqRSAA1KoZxGujYnjq/c1M/nw7h3Ly+fP13QgOsou1TdWhqryXvJu/LN3C4dw8bu3Xmkev6Ej9MOseMiWrNoEAEFRDePb6bjQKD+HV/6Rz5EQ+L4/obTftMlXCln3HeHpxCmszDtOrVQPevjOObi3qu12WCSDVKhDAN6bw6C860jA8hGc+SGXMW2uZdnssdW2AzQSo46dO89Jn3/L2ygzq1Qrmrzd25+bYVtY9ZM5ZtQuE7935s7Y0DA/h0QUbGDF1NW/fGU9k3VC3yzLGMVVlyYY9PPtRGlk5eYyIi+KJKzsSER7idmkmQDnqQBeRwSKyVUTSRWRCMfMHisg6ESkQkWFnzIsSkU9FJE1EUkWkjX/6bH+bm0VkuohU+Ff0ob1a8K87YtmRlcuwKSvJPHSiokswplS+3X+cUdPW8NC89TStV4v3fvkznruxu4WBOS8lBoKIBAGTgauALsBIEelyxmKZwBhgTjFNzAAmqWpnIB444J8+G+gEdAdqA3eXov7zdknHJsy+py9HT57mpikrSdt7zI0yjHEkN6+A5z5O46qXvyZ17zH+fH03Fj/wM3q1auB2aaYKcHKEEA+kq+oOVc0H5gFDiy6gqhmquhHwFp3uD45gVV3uXy5HVU/4f1+qfsBawLUrxmKiInjn3osIriHc/MYq1n532K1SjCmWqvLRxr1c9sKXvPHlDm6MacF/Hr2YW/u1JsjGCkwZcRIILYBdRV57/NOc6ABki8giEUkWkUn+I44f+LuKbgM+cdhmuYhuWpeF9/cnsm4ot725hs9S97tZjjE/2J6Vw+3T1/LAnHU0DA/h3fv78/ywnjSqY2Nepmw5CYTivn6ow/aDgQHAY0Ac0A5f11JRrwNfqerXxW5cZJyIJIpIYlZWlsPNlk6LBrVZeF9/OjWry72zkngncVfJKxlTTk7mFzJp2RYGv/QV63dl88yQriwZ/zP6tI5wuzRTRTkJBA/QqsjrlsAeh+17gGR/d1MBsBiI+X6miPwBiAQeOVsDqjpVVWNVNTYyMtLhZkuvYXgIc+7pR/8LG/H4wo288eX2ct+mMUWpKstS9nH5P75k8ufbua5Hc/7z6CXc0b+NXUhpypWT004TgGgRaQvsBkYAoxy2nwBEiEikqmYBg4BEABG5G7gSuExVvT/RRoULDw3mzTvieGTBep77eAuHcvN58qpOdqsLU+52HsrlD0tS+GJrFh2b1mX+uH70bdfI7bJMNVFiIKhqgYiMB5YBQcB0VU0RkYlAoqouEZE44D0gArhORJ5R1a6qWigijwErxPdpmgRM8zc9BdgJrPJ/0C5S1Yll/g5LKSS4Bi+P6E3D8BCmfrWDQzn5/O2m7vYNzZSLU6cLef2L7Uz5cjshQTX4/TWduaN/G2ra35upQOI7yScwxMbGamJiYoVuU1V5ZUU6L362jcs6NWHy6Bi71YUpUyvS9vPHD1LYdfgkQ3o253fXdKZpvVpul2WqEBFJUtXYkpartlcqOyUiPHR5NA3rhPD0+5u57c01/OuOOOrXtltdmPOz6/AJnvkglc/S9tO+SR3m3NOX/hc2drssU41ZIDh0W7/WRITV5OH567nl
jVXMuCueJvYtzpTCqdOFTP1qB5M/TyeohjDhqk7c9bO2hARb95BxlwXCObi2R3Ma1A5h3MxEbvznSmaO7UvbxuFul2UCyJfbsvjD+5vJOHSCa7pfwO+u6UzzBrXdLssYwOG9jMz/+3l0Y+be048T+YUMn7KSzbuPul2SCQC7s09y38wk7pi+lhoizLgrnsmjYywMTKVigVAKPVs14J37LiI0OIgRU1ezavsht0sylVR+gZfXv0jn8he+5IttB3j8yo58/OsBDOxQ/tfUGHOuLBBK6cLIOrx7f3+aN6jFHdPX8snmvW6XZCqZb9IPMvjlr3j+k60MiG7MZ49czAOXtic02M5SM5WTBcJ5aFa/FgvuvYhuLerxy9nrmLs20+2STCWw7+gpHpizjtH/WkOhV3lrTBxTb4+lZUSY26UZ85NsUPk8NQgLYdbdffnl7HU8uWgTh3Pz+eUlF9pVzdXQ6UIvb3+TwUufbaPAqzx8eQfuvbidXbdiAoYFQhkICwlm2u2xPLFwI5OWbeVgTh5PXdPFHmFYjazafoin39/MtwdyGNSpCX+8ritRjeyIwAQWC4QyUjOoBi8M70lEWAjTv/mOI7n5TBre0249UMUdOHaKvyxNY/H6PbRoUJtpt8dyRZembpdlTKlYIJShGjWEp67tTKM6IUxatpXsk6d5fXQMYSG2m6uagkIvM1bt5MXl28gr8PLgoPbcf0l7aodY95AJXPZJVcZEhAcubU+j8BB++94mRv9rDW+NiaNBmD3rtqpIzDjM7xdvZsu+4wzsEMkzQ7raBYqmSrBAKCcj4qNoEBbCg/OSGT5lFTPGxnNBfbsIKZAdzMnjuaVbeHedh+b1azHl1hiu7NrMTiAwVYZ1cJejwd2a8e8749l79BTD/rmK7Vk5bpdkSqHQq8xYlcGlf/+CJRt2c/8lF/LZoxczuNsFFgamSrFAKGcXXdiIeeP6kVdQyPApq9iwK9vtksw5WJd5hCGv/Zen30+hR8v6fPzQQH4zuJONC5kqyQKhAnRrUZ+F9/UnPDSIkdNW8/W35ftsaHP+DufmM+Hdjdz4+koO5uTx2qjezBrbl/ZN6rhdmjHlxgKhgrRpHM679/UnqmEYd72dwIcbnT6W2lSkQq8yZ03+GG58AAANzklEQVQmg174goVJHsYNbMeKRy/h2h7NrXvIVHmOAkFEBovIVhFJF5EJxcwfKCLrRKRARIadMS9KRD4VkTQRSRWRNv7p4/3tqYhUi6eCNKlXi/n3XkTvVhH8am4yM1dluF2SKWKjJ5sbX/+G3763iY5N67L0oQH89urO1Am17iFTPZT4ly4iQcBk4ArAAySIyBJVTS2yWCYwBnismCZmAM+q6nIRqQN4/dO/AT4Evih19QGofu2azBgbz/g563jq/RQO5ebz0GXR9u3TRal7jvH2yu94J8lD4zqhvHRLL4b2siMCU/04+eoTD6Sr6g4AEZkHDAV+CARVzfDP8xZdUUS6AMGquty/XE6RdZL9y5zfOwhAtWoGMeXWPkxYtImXPvuWQzn5/HFIV4LsVhcV5tTpQj7auJfZa3ayLjOb0OAa3Nm/Lb++Ipp6tezxqKZ6chIILYBdRV57gL4O2+8AZIvIIqAt8BkwQVULnRYoIuOAcQBRUVFOV6v0goNqMGlYDxqFh/DGVzs4fCKff9zc026NXM62Z+UwZ00mC5M8HD15mnaR4Tx1bRduimlhFw+aas9JIBT3tVXPof0BQG983Urz8XUtvelwfVR1KjAVIDY21ul2A4KI8OTVnWkYHsJzH2/h6InTTLmtj/VZl7H8Ai/LU/cze81OVm4/RM0g4cquzRjdtzX92jWslkepxhTHySePB2hV5HVLwOkpMh4guUh302KgH+cQCNXBvRdfSMPwECYs2sToaat56854Gobbt9XztevwCeYlZDI/wcPBnDxaRtTmicEdGd6nFZF1Q90uz5hKx0kgJADRItIW2A2MAEY5bD8BiBCRSFXNAgYB
iaWqtIobHtuKiLAQHpizjmFTVjJzbF9a2PN2z1mhV/l8ywFmr9nJF9uyEGBQp6aM7hfFwOhIG6cx5ieIasm9MCJyNfASEARMV9VnRWQikKiqS0QkDngPiABOAftUtat/3SuAF/B1PSUB41Q1X0QeBJ4AmgEHgKWqevdP1REbG6uJiVU7TxIyDnPX2wmEhwQzc2w80U3rul1SQDhw7BTzE3Yxd20me46eokndUEbERzEirpU9yN5UeyKSpKqxJS7nJBAqi+oQCABpe49x+/S15Bd4mT4mjj6tI9wuqVLyepWV2w8xe81Olqfup8CrDIhuzOi+UVzWuak9i8IYPwuEALfr8Alue3MN+4/l8fqtMVzasYnbJVUaR3LzWZjkYc7aTL47mEtEWE1ujm3FyPgo2thtqI35EQuEKiDreB5j3lrL1n3H+fvwnlzfu4XbJblGVUnaeYTZazL5aNNe8gu8xLWJYHTf1gzu1syeW2zMT3AaCHZ+YyUWWTeUeeP6cc+MRH49fz1HTuRz58/aul1WhTp26jSLk3cze3UmW/cfp25oMCPjWjGqb2s6NrPxFWPKkgVCJVe3Vk3evjOeh+Yl88wHqRzKyefRX3So8ufOb959lNlrdvL++j2cyC+ke4v6/O2m7lzXs7ndetqYcmL/sgJArZpBvD66D79fvInXPk/nUG4ef76+e5U7hfJkfiEfbNjD7DU72eA5Sq2aNRjaswWj+0XRo2UDt8szpsqzQAgQQTWEv9zQnUbhobz2eTpHck/z0oheVaLv/Nv9x5m9JpN313k4fqqA6CZ1eGZIV67v3YL6te2+QsZUFAuEACIiPHZlRxqGhzDxw1TufCuBqbf3oW4A3owtr6CQTzbvY/aaTNZ+d5iQoBpc3b0Zo/u1JrZ1RJXvEjOmMrJACEB3/bwtDcNDeOydDYyYupq374wPmFsx7DyUy5y1mbyT6OFwbj6tG4Xx5FWdGNanJY3qBMZ7MKaqskAIUNf3bkH9sJrcPyuJ4f5bXbRqGOZ2WcUqKPSyYssBZq3eydffHiSohnBFZ9/tJH52YWNqVLGxEGMClV2HEOCSdh7hrrcTCAmuwYy74ul8QT23S/rB3qMnmbd2F/MSMtl/LI8L6tdiRFwUt8S1oln9Wm6XZ0y1YRemVSPb9h/n9jfXkptfwPQxccS1aehaLV6v8nX6QWat3smKtP0ocHGHSEb3bc2lHSMJtttJGFPhLBCqGc+RE9z+5lp2Z59k8qgYLu/StEK3fzAnj3cSPcxZu5Ndh0/SKDyEm+NaMTIuiqhGlbMry5jqwgKhGjqUk8edbyeQsucYf7upB8P6tCzX7akqa787zKw1mXyyeS+nC5V+7Royum9rruzajJBgOxowpjKwW1dUQ43qhDLnnn7cNzOJx97ZwOHcPMYNvLDMt3P0xGkWJXuYvSaT9AM51KsVzG392jCqbxTtm9Qp8+0ZYyqGBUIVUyc0mDfHxPLIgg38ZekWDuXkM+GqTud9Xr+qssFzlNmrd/LBxj2cOu2lV6sGTBrWg2t7NKd2SOBfIGdMdWeBUAWFBgfxyojeRITV5I2vdnAoN5+/3ti9VAO6uXkFvL/edzuJlD3HCAsJ4saYloyKj6Jbi/rlUL0xxi2OAkFEBgMv43ti2r9U9a9nzB+I74lqPYARqrqwyLwo4F/4nsuswNWqmuF/JOc8oCGwDrhNVfPP/y0Z8N3q4k9Du9EoPJSXV3xL9onTvDaqt+NbXWzZd4zZqzN5L3k3OXkFdGpWlz9d343rezUPyCujjTElKzEQRCQImAxcAXiABBFZoqqpRRbLBMYAjxXTxAzgWVVdLiJ1AK9/+t+AF1V1nohMAcYC/yz1OzE/IiI8fEUHGtUJ4Q9LUrj9zbVMuyP2rPcHOnW6kKWb9jJ7TSZJO48QElyDa3tcwOi+rYmJamC3kzCminNyhBAPpKvqDgARmQcMBX4IBFXN8M/zFl1RRLoAwaq63L9cjn+6AIOAUf5F/w38
EQuEcnH7RW2ICAvhkQXrueWNVcy4K54m9f7/wrDvDuYyZ81O3knykH3iNO0ah/P7azozrE9LGoSFuFi5MaYiOQmEFsCuIq89QF+H7XcAskVkEdAW+AyYAEQA2apaUKTN6vs4sApwXc/mNAiryb0zk7hpykreGhPPtv3Hmb1mJ9+kHyK4hnBl12aM7hvFRRc2sqMBY6ohJ4FQ3CeD04sXgoEBQG983Urz8XUtLXHapoiMA8YBREVFOdysKc6A6Ejm3tOPMW+t5fJ/fAlAiwa1efzKjgyPbUmTunY7CWOqMyeB4ME3IPy9lsAeh+17gOQi3U2LgX7AdKCBiAT7jxLO2qaqTgWmgu/CNIfbNWfRs1UD3rmvP9O/+Y7LOzfh4g5NqtyDdowxpePkPMQEIFpE2opICDCC4r/hn23dCBGJ9L8eBKSq7/Loz4Fh/ul3AO87L9ucj/ZN6vCXG7ozqFNTCwNjzA9KDAT/N/jxwDIgDVigqikiMlFEhgCISJyIeIDhwBsikuJftxDfmUcrRGQTvu6naf6mfwM8IiLpQCPgzbJ9a8YYY86F3cvIGGOqOKf3MrK7jxljjAEsEIwxxvhZIBhjjAEsEIwxxvhZIBhjjAEsEIwxxvgF1GmnIpIF7HS7jvPUGDjodhGVjO2T/2X743/Z/vixc90nrVU1sqSFAioQqgIRSXRyPnB1Yvvkf9n++F+2P36svPaJdRkZY4wBLBCMMcb4WSBUvKluF1AJ2T75X7Y//pftjx8rl31iYwjGGGMAO0IwxhjjZ4FQTkRksIhsFZF0EZlQzPxHRCRVRDaKyAoRae1GnRWlpP1RZLlhIqIiUuXPKnGyT0TkZv/fSYqIzKnoGiuSg38zUSLyuYgk+//dXO1GnRVFRKaLyAER2XyW+SIir/j310YRiTnvjaqq/ZTxDxAEbAfaASHABqDLGctcCoT5f78fmO923W7uD/9ydYGvgNVArNt1u71PgGggGYjwv27idt0u74+pwP3+37sAGW7XXc77ZCAQA2w+y/yrgY/xPWemH7DmfLdpRwjlIx5IV9UdqpoPzAOGFl1AVT9X1RP+l6vxPUa0qipxf/j9CXgeOFWRxbnEyT65B5isqkcAVPVABddYkZzsDwXq+X+vj/NH+QYkVf0KOPwTiwwFZqjPanyPJb7gfLZpgVA+WgC7irz2+KedzVh8SV9Vlbg/RKQ30EpVP6zIwlzk5G+kA9BBRL4RkdUiMrjCqqt4TvbHH4Fb/U9nXAr8qmJKq7TO9XOmRMHnVY45m+IeVFzs6VwicisQC1xcrhW56yf3h4jUAF4ExlRUQZWAk7+RYHzdRpfgO4L8WkS6qWp2OdfmBif7YyTwtqq+ICIXATP9+8Nb/uVVSo4/Z5yyI4Ty4QFaFXndkmIOb0XkcuB3wBBVzaug2txQ0v6oC3QDvhCRDHz9oUuq+MCyk78RD/C+qp5W1e+ArfgCoipysj/GAgsAVHUVUAvfPX2qK0efM+fCAqF8JADRItJWREKAEcCSogv4u0jewBcGVblvGErYH6p6VFUbq2obVW2Db0xliKpW5Qdol/g3AizGd/IBItIYXxfSjgqtsuI42R+ZwGUAItIZXyBkVWiVlcsS4Hb/2Ub9gKOquvd8GrQuo3KgqgUiMh5Yhu/siemqmiIiE4FEVV0CTALqAO+ICECmqg5xrehy5HB/VCsO98ky4BcikgoUAo+r6iH3qi4/DvfHo8A0EXkYX9fIGPWfblMVichcfN2Fjf3jJn8AagKo6hR84yhXA+nACeDO895mFd6fxhhjzoF1GRljjAEsEIwxxvhZIBhjjAEsEIwxxvhZIBhjjAEsEIwxxvhZIBhjjAEsEIwxxvj9H3eVMBpj/wVJAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x1fef5e976a0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Mean cross-validated RMSE for each max_features fraction tried for the random forest\n",
    "fig, ax = plt.subplots()\n",
    "ax.plot(max_features, test_scores)\n",
    "# Trailing semicolon suppresses the matplotlib object repr in the cell output\n",
    "ax.set(title=\"Random forest: CV error vs. max_features\", xlabel=\"max_features\", ylabel=\"mean CV RMSE\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step5 Ensemble\n",
    "#### 汲取两种或多种模型的优点，把最好的参数拿出来，做成我们的最终模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Ridge(alpha=15, copy_X=True, fit_intercept=True, max_iter=None,\n",
       "   normalize=False, random_state=None, solver='auto', tol=0.001)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the final Ridge model on the full training set with alpha=15\n",
    "ridge = Ridge(alpha=15).fit(x_train, y_train)\n",
    "ridge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "           max_features=0.3, max_leaf_nodes=None,\n",
       "           min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "           min_samples_leaf=1, min_samples_split=2,\n",
       "           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\n",
       "           oob_score=False, random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fit the final random forest on the full training set.\n",
    "# Fixed random_state so the fitted model and its predictions are reproducible (42 is arbitrary).\n",
    "rf = RandomForestRegressor(n_estimators=500, max_features=0.3, random_state=42)\n",
    "rf.fit(x_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 因为最前面我们把label值log(1+x)了，于是这里我们要把预测值exp回去，用expm1()函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Predict on the test set and apply expm1 to map predictions back from the log(1+x) scale\n",
    "y_ridge, y_rf = (np.expm1(model.predict(x_test)) for model in (ridge, rf))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 一个正经的ensemble是把这群model的预测结果作为新的input，再做一次预测；这里用简单方法，直接取平均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Simple ensemble: equal-weight average of the two models' predictions\n",
    "y_final = 0.5 * (y_ridge + y_rf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## step6 提交结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the submission table: one row per test Id with its predicted SalePrice\n",
    "submission_df = pd.DataFrame(\n",
    "    {\"Id\": data_tes.index, \"SalePrice\": y_final},\n",
    "    columns=[\"Id\", \"SalePrice\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1461</td>\n",
       "      <td>116679.941954</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1462</td>\n",
       "      <td>150720.987423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1463</td>\n",
       "      <td>176293.174518</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1464</td>\n",
       "      <td>189365.173195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1465</td>\n",
       "      <td>188568.912643</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     Id      SalePrice\n",
       "0  1461  116679.941954\n",
       "1  1462  150720.987423\n",
       "2  1463  176293.174518\n",
       "3  1464  189365.173195\n",
       "4  1465  188568.912643"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the submission table\n",
    "submission_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the submission file without the positional row index (Id is already a column)\n",
    "submission_df.to_csv(\"sample_submission.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
